-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsra_ITS_metagenome_miner.pl
248 lines (198 loc) · 6.95 KB
/
sra_ITS_metagenome_miner.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#!/usr/bin/perl
# This program is used to search for specific ITS (or other) sequences in the SRA database using a mapping approach bwa
# Requires sratoolkit, bwa, samtools, ITSx, mafft
# Expected input is a fasta file of ITS sequences
# A list of SRA data in a text format with one experiments (acc. num.) per line.
# Make sure ITSx is in your $PATH or modify below
# In order not to fill up your home directory you need to redirect caching by sratoolkit or disable it
# Code from https://standage.github.io/that-darn-cache-configuring-the-sra-toolkit.html
# mkdir -p ~/.ncbi
# echo '/repository/user/main/public/root = "/scratch/standage/sra-cache"' > ~/.ncbi/user-settings.mkfg
# Uncomment the next command if you want to disable network access altogether
# echo '/repository/user/cache-disabled = "true"' > ~/.ncbi/user-settings.mkfg
# I disabled caching completely
# open input and output files and 3rd argument which is y or n for whether it is a TLS (targeted locus study)
$inputfasta = $ARGV[0];
$inputSamples = $ARGV[1];
# Directory for the fastq data
$fastdirectory = '/scratch/tyjames_root/tyjames0/tyjames/Mala/03.07.23/temp3/data';
# Name of log file for fasterq-dump
$dumpfilelog = 'fasterq_dump.log';
if (@ARGV < 1) {
die "Improper number of arguments. Usage: perl sra_ITS_miner.pl inputfastafile inputSRAlist\n";
}
##############################
######### RUN ITSX ###########
##############################
# name for ITSx output file
$splitfasta = 'ITSx_out.concat.fasta';
my $cmd = 'ITSx -i ' . $inputfasta . ' -t Fungi';
print $cmd;
# ITSx -i targets_unsplit.fasta -o targets_ITSx -t Fungi
system($cmd);
# ITSx puts the sequences separately into two files ITSx_out.ITS1.fasta and ITSx_out.ITS2.fasta
$cmd = 'cat ITSx_out.ITS1.fasta ITSx_out.ITS2.fasta >ITSx_out.concat.fasta';
system($cmd);
# Next we will need to create an indexed file for bwa
$cmd = 'bwa index -p ITSx_bwa_reference ITSx_out.concat.fasta';
system($cmd);
# Extract sequence names from fasta file
my @seqnames;
open (INPUTFASTA, $splitfasta) or die $!;
while (<INPUTFASTA>) {
if ( /^>/ ) {
# get the ID from the header
my @header = split /\s/, $_;
$header[0] =~ s/\>//;
push (@seqnames, $header[0]);
}
}
$numseqs = scalar(@seqnames);
print "\n$numseqs query sequences parsed. End ITSx.\n\n";
# close the file
close INPUTFASTA;
##############################
## PRINT OUT BIOSAMPLE DATA ##
##############################
open (OUTPUTTABLE, ">sra_ITS_miner_output.txt");
print OUTPUTTABLE "SRA_Expt\tData_size\tRead_length";
foreach (@seqnames) {
print OUTPUTTABLE "\t$_";
# also open file with the filename using the sequence name to dump the hits
# as well as a file with the metadata for each sequence search
$outfilename = $_;
$outfilename =~ s/\|/\_/g;
$outfilename = $outfilename . '.fasta_list';
open(FH, '>', $outfilename) or die $!;
}
print OUTPUTTABLE "\n";
# Open sample data set and read in line by line each study
open (SAMPLETABLE, $inputSamples) or die $!;
# Go through table of samples
while (<SAMPLETABLE>)
{
$experiment = $_;
chomp($experiment);
$experiment =~ s/\s+$//;
print OUTPUTTABLE "$experiment\t";
##############################
###### DOWNLOAD FASTQ/A ######
##############################
# Use sratoolkit to download fastq file
# Reads are not treated as paired and separate, even if they are
# This may lead to double counting. May need to be dealt with later
# name for the prefetch file
$srafile = $fastdirectory . '/' . $experiment . '/' . $experiment . '.sra';
# First download the data using prefetch
until (-e $srafile)
{
$cmd = "prefetch -v -p -X 500g -f all -C yes " . $experiment . " -O " . $fastdirectory;
print "$cmd\n";
system($cmd);
}
# name for the fastq/a file
$fastfile = $experiment . '.fastq'; # currentlyused
# Next download the reads via fasterq-dump
# -e = num threads
# -f to overwrite existing file
$cmd = "fasterq-dump --split-spot -e 8 -f -o " . $fastfile . ' ' . $srafile . " 2>" . $dumpfilelog;
print "$cmd\n";
system($cmd);
# Now we will get the number of sequences and the length of each read
open (DUMPLOGFILE, $dumpfilelog);
while (<DUMPLOGFILE>)
{
print $_;
if ($_ =~ /reads read/)
{
@data = split /\s+/, $_;
$readnum = $data[3];
}
}
print "readnum=$readnum\n";
$cmd = "head -n 2 " . $fastfile . " | tail -1 | wc -c";
my $seqlength = `$cmd`;
chomp($seqlength);
print "seqlength=$seqlength\n";
# Check filesize which could prevent large files from being searched. Uses a lot of memory
my $filesize = -s $fastfile;
# Make a hash of each seq name & set count for each to zero for each experiment
foreach (@seqnames)
{
$count_hits{$_} = 0;
}
# Now print the data size and read length in output table
print OUTPUTTABLE "\t$readnum\t$seqlength";
print "\nFinished downloading\n\n";
##############################
############ BWA #############
##############################
# Now use bwa to map reads to indexed file with the concatenated 5.8S removed sequences
# First checking if the file isn't massive, not used for this script so much
if ($filesize < 1e12)
{
# bwa -t = threads
# bwa -L = clipping penalty. High numbers mean that match needs to cover most of the read
# bwa -T = Threshold for reporting score.
# samtools -F 4 reports only matches
$samfile = $experiment . '.sam';
$cmd = 'bwa mem -t 8 -L 100 -T 32 -O 3 ITSx_bwa_reference ' . $fastfile . ' | samtools view -F 4 > ' . $samfile;
print $cmd;
system($cmd);
# Now open samfile output
open (SAMF, $samfile);
# Read in each line and check if it matches a hit
while (<SAMF>)
{
$line = $_;
@targethit = split (/\t/, $_);
foreach (@seqnames)
{
$tempoutfilename = $_;
# Replace hashes with underscores
$tempoutfilename =~ s/\|/\_/g;
$outfastalistfilename = $tempoutfilename . '.fasta_list';
open (FLFILE, '>>', $outfastalistfilename) or die $!;
$name = $_;
# \Q allows match literal regardless of weird characters
if ($line =~ /\Q$name/)
{
$count_hits{$_}++;
# Pulls fasta name from sam file, not used;
$seqdesired = $targethit[1];
print FLFILE "$experiment\t$line";
}
}
}
}
else
{
$analyzed = "no";
}
print "\nFinished bwa\n\n";
##############################
### PRINT COUNTS AND SEQS ####
##############################
# Print # observations of each sequence
if ($analyze eq 'no')
{
foreach (@seqnames)
{
print OUTPUTTABLE "\tNA";
}
}
else
{
foreach (@seqnames)
{
print OUTPUTTABLE "\t$count_hits{$_}";
}
# Close line in outputtable
}
print OUTPUTTABLE "\n";
# Can remove fastq and sra files to save space
$cmd = "rm " . $fastfile;
system ($cmd);
$cmd = "rm " . $srafile;
system ($cmd);
}