forked from stenglein-lab/btv_sequencing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathidentify_best_segments_from_sam
executable file
·127 lines (100 loc) · 3 KB
/
identify_best_segments_from_sam
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/perl
#
# This script identifies the best aligned-to BTV reference sequences
#
# Mark Stenglein July 27, 2012
#
use strict;
use Getopt::Long;
my $usage = <<USAGE;
This script identifies the best aligned-to BTV reference sequences
Inputs: fasta file of BTV ref seqs, and
sam file of reads aligned to those ref seqs
usage: $0 [-h] [-u] <fasta_file_used_to_create_bt_index> <sam_file>
-h print this message
USAGE
my $print_usage = 0;
my $output_unmapped = 0;
if (scalar @ARGV != 2) { print $usage; exit; }
GetOptions ("h" => \$print_usage);
if ($print_usage) { print $usage; exit; }
my $fasta_file = shift or print $usage and die($!);
open (my $fasta_fh, "<", $fasta_file) or print $usage and die("error: couldn't open fasta file $fasta_file\n$!\n");
my $sam_file = shift or print $usage and die($!);
open (my $sam_fh, "<", $sam_file) or print $usage and die("error: couldn't open sam file $sam_file\n$!\n");
my $flag = 0;
my @fields = ();
my @tags = ();
my %subject_id_tally = ();
# iterate through sam file and keep track of how many times each segment (each index subject)
# gets a read aligned to it
while (<$sam_fh>)
{
chomp;
if (/^@/)
{
# header lines - ignore them
next;
}
# split line into tab-delimited components
@fields = split "\t";
# is this an unmapped query?
if (!$output_unmapped)
{
my $flag = $fields[1];
# don't output unmapped queries
# see SAM format spec.
if ($flag & 4) { next; }
}
# create tally of aligned-to subject IDs
my $subject_id = $fields[2];
$subject_id_tally{$subject_id} += 1;
}
my @segment_numbers = (1..10);
my %best_aligned_to_segment_ids = ();
# for each segment #1 -> #10
foreach my $segment (@segment_numbers)
{
# iterate through a sorted list of aligned-to subject IDs and stop once we reach the most aligned-to for
# this particular segment
foreach my $subject_id (sort {$subject_id_tally{$b} <=> $subject_id_tally{$a}} keys %subject_id_tally)
{
# my $subject_id = $subject_id_tally{$subject_id};
# print "$subject_id\t$hit_count\n";
if ($subject_id =~ /s(\d+)_/)
{
my $this_segment_number = $1;
if ($this_segment_number == $segment)
{
warn "$segment: $subject_id\n"; # output to stderr
$best_aligned_to_segment_ids{$subject_id} = 1; # keep track of best segment IDs
last; # break out of for loop
}
}
else
{
die ("error: couldn't parse segment # for subject ID: $subject_id\n");
}
}
}
# now parse through fasta file and output seqs if they are one of the best 10
my $printing_lines = 0;
while (<$fasta_fh>)
{
chomp;
# fasta header line
if (/^>(.+)/)
{
$printing_lines = 0;
my $seq_id = $1;
# is this one of the best segments? If so, we'll print it out
if ($best_aligned_to_segment_ids{$seq_id})
{
$printing_lines = 1;
}
}
if ($printing_lines)
{
print "$_\n";
}
}