forked from mattb112885/clusterDbAnalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheckInputFormat.sh
executable file
·204 lines (184 loc) · 7.56 KB
/
checkInputFormat.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/bin/bash
# Check for existence and formatting of required input files.
# On success, will give no errors and return 0
#
STATUS=0

#######################################
# Validate the "organisms" file in the current directory.
# Globals:
#   STATUS   (written) set to 1 when any check fails
#   orgmatch (written) newline-separated second-column values that have the
#            organism ID format (#.#); kept global because the raw and
#            genbank checks later in this script read it
# Outputs:
#   Progress and ERROR messages on stdout
#######################################
check_organisms_file() {
  local line_count name_count id_count

  # The organisms file is generated automatically before this script is
  # called; this check alerts us if that step went wrong.
  echo "Checking for existence of organisms file..."
  if [ ! -f "organisms" ]; then
    echo "ERROR: organisms file not found in expected location"
    STATUS=1
    # Nothing else can be checked without the file, and no stale IDs should
    # be left behind for the later checks.
    orgmatch=""
    return
  fi

  # The organism ID must be the second column. Only the presence of at least
  # one well-formed ID is required. Testing the assignment directly (instead
  # of "$? -eq 1") also reports a grep failure (exit status > 1) rather than
  # letting it pass silently.
  echo "Checking format of organism ID in organisms file..."
  if ! orgmatch=$(cut -f 2 organisms | grep -P '^\d+\.\d+$'); then
    echo 'ERROR: Organism IDs not found or not in the expected format (second column of organisms file, and must have format #.# i.e. 83333.1 for E coli)'
    STATUS=1
  fi

  # Every line must contribute a distinct name (column 1) and a distinct ID
  # (column 2), so the unique counts must equal the total line count.
  echo "Checking for uniqueness of organisms in organisms file..."
  line_count=$(sort organisms | wc -l)
  name_count=$(cut -f 1 organisms | sort -u | wc -l)
  id_count=$(cut -f 2 organisms | sort -u | wc -l)
  if [ "${line_count}" != "${name_count}" ]; then
    echo 'ERROR: Organism names must be unique (first column of organisms file)'
    STATUS=1
  fi
  if [ "${line_count}" != "${id_count}" ]; then
    echo 'ERROR: Organism IDs must be unique (second column of organisms file)'
    STATUS=1
  fi

  # Reject single quotes and pipe characters anywhere in the file (quotes are
  # particularly troublesome). A plain bracket expression matches the same
  # two characters as the previous Perl-style class without needing grep -P.
  if grep "['|]" organisms > /dev/null; then
    echo 'ERROR: Bad characters (quotes) found in organism names in the organism file.'
    STATUS=1
  fi
}

check_organisms_file
#######################################
# Validate the "groups" file (used for clustering) in the current directory
# and cross-check it against the organism names in the "organisms" file.
# Globals:
#   STATUS (written) set to 1 when the groups file is missing or contains
#          duplicate lines
# Outputs:
#   Progress, ERROR and WARNING messages on stdout
#######################################
check_groups_file() {
  local total_lines distinct_lines orgname

  echo "Checking existence of groups file..."
  if [ ! -f "groups" ]; then
    echo 'ERROR: groups file not found in expected location'
    STATUS=1
    # The remaining checks all read the groups file.
    return
  fi

  # Any duplicated line makes the unique count smaller than the total.
  echo "Checking for uniqueness of groups..."
  total_lines=$(sort groups | wc -l)
  distinct_lines=$(sort -u groups | wc -l)
  if [ "${total_lines}" != "${distinct_lines}" ]; then
    echo 'ERROR: Group names must be unique'
    STATUS=1
  fi

  # An organism that appears in no group only produces a warning; STATUS is
  # deliberately left untouched.
  echo "Checking whether organisms are found in the groups file..."
  [ -f "organisms" ] || return 0
  # Read one name per line. Names may contain spaces, so they must not be
  # word-split; and they must not be handed to grep as a single multi-line
  # string either, because grep -F would then accept a match of ANY name.
  while IFS= read -r orgname || [ -n "${orgname}" ]; do
    [ -n "${orgname}" ] || continue
    if ! grep -F -- "${orgname}" groups > /dev/null; then
      echo "WARNING: Organism ${orgname} was in the organisms file but did not appear in any groups. Did you forget something?"
    fi
  done < <(cut -f 1 organisms)
}

check_groups_file
### RAW file tests

#######################################
# Succeed when at least one line of a tab-separated column matches a pattern.
# Arguments:
#   $1    - file to read
#   $2    - column number (passed to cut -f)
#   $3    - Perl-compatible regular expression (passed to grep -P)
#   $4... - optional extra grep options (e.g. -i)
# Returns:
#   0 if some line matched; non-zero otherwise, including when grep fails
#######################################
_raw_column_matches() {
  local file=$1 column=$2 pattern=$3
  shift 3
  cut -f "${column}" -- "${file}" | grep -P "$@" -- "${pattern}" > /dev/null
}

#######################################
# Validate the names and contents of the files in the "raw" directory.
# Globals:
#   STATUS   (written) set to 1 when any check fails
#   orgmatch (read) newline-separated organism IDs taken from the organisms
#            file earlier in this script
# Outputs:
#   Progress and ERROR messages on stdout
#######################################
check_raw_files() {
  local file org expectedorg actualorg
  local -r name_re='^[0-9]+\.[0-9]+\.txt$' id_re='[0-9]+\.[0-9]+'
  local -a raw_files=()

  # Without this guard a missing directory would make every check below run
  # against the wrong directory, and the final "cd .." would leave the
  # working directory altogether.
  if ! cd raw 2> /dev/null; then
    echo "ERROR: raw directory not found in expected location"
    STATUS=1
    return
  fi

  # Glob instead of parsing ls so names containing whitespace stay intact.
  # Anything with README in its name is ignored.
  for file in *; do
    [ -e "${file}" ] || continue # an unmatched glob is left as a literal "*"
    [[ "${file}" == *README* ]] && continue
    raw_files+=("${file}")
  done

  echo "Checking the format of raw file names..."
  for file in "${raw_files[@]}"; do
    if [[ ! "${file}" =~ ${name_re} ]]; then
      echo "ERROR: Raw file ${file} does not have a name in the expected format ( [organismid].txt)"
      STATUS=1
    fi
  done

  echo "Checking for existence of appropriately-named raw files for each organism in the organisms file..."
  while IFS= read -r org; do
    [ -n "${org}" ] || continue
    # File name must exactly be [organismID].txt, so test for that exact file
    # rather than searching the listing for a word match.
    if [ ! -f "${org}.txt" ]; then
      echo "ERROR: No raw file match for organism ID ${org} - file name must be ${org}.txt"
      STATUS=1
    fi
  done <<< "${orgmatch:-}"

  echo "Checking that all raw file organism IDs have an entry in the organisms file..."
  for file in "${raw_files[@]}"; do
    expectedorg=""
    [[ "${file}" =~ ${id_re} ]] && expectedorg=${BASH_REMATCH[0]}
    # -x demands that the whole second-column value equals the ID.
    if [ -z "${expectedorg}" ] \
      || ! cut -f 2 ../organisms 2> /dev/null | grep -x -F -- "${expectedorg}" > /dev/null; then
      echo "ERROR: Organism ID in raw file ${file} has no entry in the organism file"
      STATUS=1
    fi
  done

  echo "Checking formatting of each raw file..."
  for file in "${raw_files[@]}"; do
    # Note - all of these check for the existence of ONE thing with the right
    # format in each column (they don't check that ALL of the rows are the
    # right format).
    # The following useful columns are not checked:
    #   contig (column 1)   - no specific format required
    #   function (column 8) - no specific format required
    # The following columns are never used by the programs:
    #   column 4 (location) - columns 1, 5, 6 and 7 are used instead
    #   column 9 (aliases)  - the "aliases" file is used instead
    #   column 10 (figfam)
    #   column 11 (evidence codes)
    if ! _raw_column_matches "${file}" 2 '^fig\|\d+\.\d+\.peg\.\d+$'; then
      echo "ERROR: Gene IDs in raw file ${file} were not in expected format (fig|#.#.peg.# where the first two are the organism ID) or not in the expected place (second column in raw file)"
      STATUS=1
    fi

    # The organism ID embedded in the file name must agree with the one in
    # the gene ID on the last line of the file.
    expectedorg=""
    [[ "${file}" =~ ${id_re} ]] && expectedorg=${BASH_REMATCH[0]}
    actualorg=$(cut -f 2 -- "${file}" | tail -n 1 | grep -o -P '\d+\.\d+')
    if [ "${expectedorg}" != "${actualorg}" ]; then
      echo "ERROR: Gene ID ${expectedorg} in the name of raw file ${file} does not match the organism ID ${actualorg} within the file itself."
      STATUS=1
    fi

    if ! _raw_column_matches "${file}" 3 '^peg$'; then
      echo "ERROR: No objects of type peg (third column) identified in file ${file}. Only pegs (protein encoding genes) are considered in our clustering analysis!"
      STATUS=1
    fi
    if ! _raw_column_matches "${file}" 5 '^\d+$'; then
      echo "ERROR: Gene start location (fifth column) expected to be a number in file ${file}"
      STATUS=1
    fi
    if ! _raw_column_matches "${file}" 6 '^\d+$'; then
      echo "ERROR: Stop location (sixth column) expected to be a number in file ${file}"
      STATUS=1
    fi
    if ! _raw_column_matches "${file}" 7 '^[+-]$'; then
      echo "ERROR: Strand (seventh column) must be + or - in file ${file}"
      STATUS=1
    fi

    # ACGT are the standard nucleotides and NRWYMKSHBVD the ambiguous ones.
    # Anything else is an error. No gaps are allowed.
    if ! _raw_column_matches "${file}" 12 '^[acgtnrwymkshbvd]+$' -i; then
      echo "ERROR: Nucleotide sequence expected in 12th column in file ${file}"
      STATUS=1
    fi

    # This does not match the header because of the "_" in aa_sequences, but
    # that is a bit fragile. The \s is there because the amino acid sequences
    # can contain spaces.
    if ! _raw_column_matches "${file}" 13 '^[A-Z\s]+$' -i; then
      echo "ERROR: Amino acid sequence expected in 13th column in file ${file}"
      STATUS=1
    fi
  done

  cd .. || return
}

check_raw_files
### Check genbank files.

#######################################
# Validate the names of the files in the "genbank" directory against the
# organism IDs from the organisms file.
# Globals:
#   STATUS   (written) set to 1 when any check fails
#   orgmatch (read) newline-separated organism IDs taken from the organisms
#            file earlier in this script
# Outputs:
#   Progress and ERROR messages on stdout
#######################################
check_genbank_files() {
  local file org orgid
  local -r name_re='^[0-9]+\.[0-9]+\.gbk$' id_re='[0-9]+\.[0-9]+'
  local -a gbk_files=()

  # Without this guard a missing directory would make the checks below run
  # against the wrong directory, and the final "cd .." would leave the
  # working directory altogether.
  if ! cd genbank 2> /dev/null; then
    echo "ERROR: genbank directory not found in expected location"
    STATUS=1
    return
  fi

  # Glob instead of parsing ls so names containing whitespace stay intact.
  # Anything with README in its name is ignored.
  for file in *; do
    [ -e "${file}" ] || continue # an unmatched glob is left as a literal "*"
    [[ "${file}" == *README* ]] && continue
    gbk_files+=("${file}")
  done

  echo "Checking the format of genbank file names..."
  for file in "${gbk_files[@]}"; do
    if [[ ! "${file}" =~ ${name_re} ]]; then
      echo "ERROR: Genbank file ${file} does not have a name in the expected format ([organismid].gbk)"
      STATUS=1
    fi
  done

  echo "Checking for existence of genbank files for every organism in the organisms file..."
  while IFS= read -r org; do
    [ -n "${org}" ] || continue
    # File name must exactly be [organismID].gbk, so test for that exact file
    # rather than searching the listing for a word match.
    if [ ! -f "${org}.gbk" ]; then
      echo "ERROR: No genbank file match for organism ID ${org} - file name must be ${org}.gbk and placed in the genbank folder"
      STATUS=1
    fi
  done <<< "${orgmatch:-}"

  echo "Checking that all organism IDs in the genbank files have an entry in the organisms file..."
  for file in "${gbk_files[@]}"; do
    orgid=""
    [[ "${file}" =~ ${id_re} ]] && orgid=${BASH_REMATCH[0]}
    # -x demands that the whole second-column value equals the ID.
    if [ -z "${orgid}" ] \
      || ! cut -f 2 ../organisms 2> /dev/null | grep -x -F -- "${orgid}" > /dev/null; then
      echo "ERROR: Organism ID ${orgid} in genbank file ${file} has no entry in the organism file"
      STATUS=1
    fi
  done

  cd .. || return
}

check_genbank_files
# An aliases file is not required, though having one is recommended, so its
# absence only produces a warning and never changes the exit status.
[ -f ./aliases/aliases ] \
  || echo "WARNING: No aliases file found - no alias subsitution will be performed for gene names"

# Report the accumulated result of all checks to the caller.
exit ${STATUS}