-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsmips.pl
958 lines (850 loc) · 34.5 KB
/
smips.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
#!/usr/bin/perl
##############################################################
# SMIPS - (S)econdary (M)etablolites from (I)nter(P)ro(S)can #
##############################################################
# Copyright (C) 2015 Leibniz Institute for Natural Product Research and
# Infection Biology -- Hans-Knoell-Institute (HKI)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Dieses Programm ist Freie Software: Sie können es unter den Bedingungen
# der GNU General Public License, wie von der Free Software Foundation,
# Version 3 der Lizenz oder (nach Ihrer Wahl) jeder neueren
# veröffentlichten Version, weiterverbreiten und/oder modifizieren.
#
# Dieses Programm wird in der Hoffnung, dass es nützlich sein wird, aber
# OHNE JEDE GEWÄHRLEISTUNG, bereitgestellt; sogar ohne die implizite
# Gewährleistung der MARKTFÄHIGKEIT oder EIGNUNG FÜR EINEN BESTIMMTEN ZWECK.
# Siehe die GNU General Public License für weitere Details.
#
# Sie sollten eine Kopie der GNU General Public License zusammen mit diesem
# Programm erhalten haben. Wenn nicht, siehe <http://www.gnu.org/licenses/>.
#
# Contact: [email protected], [email protected]
use 5.14.2;
use strict;
use warnings;
use Getopt::Long;
use List::Util qw(any);
use Scalar::Util qw(looks_like_number);
use Sort::Naturally;
use IPC::System::Simple qw(system); # improved behavior of Perl's "system()" call
use File::Which; # emulation of "which" command line tool
use File::Basename;
use Bio::SeqIO;
use Data::Dumper;
$Data::Dumper::Terse = 1;
$Data::Dumper::Sortkeys = 1;
use Text::Autoformat; # wrap text automatically (especially helpful for usage text)
use Text::CSV; # check if Text::CSV_XS is also installed, for huge gain in reading/writing speed
my $tsv = Text::CSV->new(
{
allow_loose_quotes => 1, # bad practice in csv format (blame the creator of the input file?)
allow_loose_escapes => 1, # bad practice in csv format (blame the creator of the input file?)
sep_char => "\t",
eol => "\n",
binary => 1
}
);
######################
# Domain assignments #
######################
sub domain_assignments { }
my %iprs = (
### PKS ###
# "general" PKS domains
IPR015083 => "xPKS",
# IPR011141 => "pks", (only in PIRSF, plant PKS type III)
IPR013601 => "xPKS",
IPR006765 => "Cyc",
IPR014043 => "AT", # also FAS beta subunit in Candida dubliniensis
IPR020801 => "AT",
IPR016035 => "AT", # also FAS beta subunit in Candida dubliniensis
IPR016036 => "AT", # sounds like ACP/PP, but is small part of AT
IPR014030 => "KS",
IPR014031 => "KS",
IPR020841 => "KS",
IPR018201 => "KS",
IPR013968 => "KR",
IPR020842 => "KR",
IPR020807 => "DH",
IPR020843 => "ER",
IPR011032 => "ER",
IPR013154 => "ER",
IPR013149 => "ER",
IPR013217 => "MT",
IPR013216 => "MT",
IPR004033 => "MT",
IPR020803 => "MT",
IPR029063 => "MT",
### NRPS ###
# "general" NRPS domains
IPR010060 => "xNRPS",
IPR013624 => "xNRPS",
IPR012728 => "xNRPS",
IPR010071 => "A",
IPR000873 => "A",
IPR020845 => "A",
IPR025110 => "A",
IPR001242 => "C",
### PKS or NRPS ###
IPR009081 => "PP",
IPR020806 => "PP",
IPR006162 => "PP",
IPR003231 => "PP",
IPR008278 => "PP",
IPR004568 => "PP",
IPR001031 => "TE",
IPR020802 => "TE",
IPR010080 => "TE",
### DMATS ###
IPR017795 => "Prenyltransferase",
IPR012148 => "Prenyltransferase",
IPR017796 => "Prenyltransferase",
IPR000537 => "Prenyltransferase",
);
my %domains = (
xPKS => "PKS",
Cyc => "PKS",
AT => "PKS",
KS => "PKS",
KR => "PKS",
DH => "PKS",
ER => "PKS",
# MT => "PKS", # can also be found in NRPSs, hence MT does not necessarily result in PKS or hybrid anchor gene
xNRPS => "NRPS",
A => "NRPS",
C => "NRPS",
PP => "PKS/NRPS",
TE => "PKS/NRPS",
MT => "PKS/NRPS", # see MT
Prenyltransferase => "DMATS",
);
sub usage()
{
my $usage = autoformat(
"Copyright (C) 2015 Leibniz Institute for Natural Product Research and Infection Biology -- Hans-Knoell-Institute (HKI)\n\n"
. "This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it under certain conditions. See the file COPYING, which you should have received along with this program, for details.\n\n"
. " ##############################################################\n"
. " # SMIPS - (S)econdary (M)etablolites from (I)nter(P)ro(S)can #\n"
. " ##############################################################\n\n"
. "SMIPS is a tool to predict secondary metabolite (SM) anchor (or backbone) genes in protein sequences. These enzymes play a major role in synthesizing secondary metabolites and their genes are often clustered together with others participating in the same metabolic pathway. Common SM anchor genes are Polyketide synthases (PKS) and Non-ribosomal peptide synthetases (NRPS). The predictions are based on protein domain annotations provided by InterProScan.",
{ justify => "full", all => 1 }
)
. "\n\n"
. "usage: smips.pl [ options ] [ <interproscan_file.[tsv|out|tab]> | <protein_sequences.fasta> ]\n\n"
. autoformat(
"options:\n\n"
. " --path-to-interproscan, -p <path>\n\n"
. " Path to InterProScan software, including the file name. Usually the file name is \"interproscan.sh\". If not specified, SMIPS uses the \"which\" command to figure out the location of InterProScan. Only needed if a FASTA file is given.\n\n"
. "\n"
. "SMIPS accepts one (many) InterProScan output file(s) in TSV or OUT format, or the corresponding JGI files in TAB format, as input. It analyses them for typical SM anchor/backbone genes and writes their domain structure, along with additionl information, to two files in CSV format, extending the file name(s) of the input file(s).\n\n"
. "As an alternative, SMIPS accepts protein sequence files in FASTA format. If InterProScan is installed on your machine, SMIPS first calls InterProScan with the given FASTA file(s) and than proceeds with the resulting TSV file(s).\n\n"
. "If you don't have a proper InterProScan output at hand, you may download and install InterProScan from",
{ justify => "full", all => 1 }
)
. "\n"
. " http://www.ebi.ac.uk/interpro/download.html\n" . "\n"
. autoformat(
"and apply the following command-line to get the desired file (SMIPS will apply the very same command-line, if you run it with a FASTA file)",
{ justify => "full", all => 1 }
)
. "\n"
# . " interproscan.sh -appl PrositePatterns,SuperFamily,PfamA,SMART,TIGRFAM -i <fasta file with protein sequences> -f tsv,svg -iprlookup\n" # for older InterProScan versions with HTML bug
. " interproscan.sh -appl PrositePatterns,SuperFamily,PfamA,SMART,TIGRFAM -i <fasta file with protein sequences> -f tsv,html -iprlookup\n"
. "\n"
. autoformat(
# "InterProScan HTML output is broken, hence using SVG. However, it's only for graphical (webservice-like) output and optional - you may omit that option.\n"
"\n"
. "These mappings inside the SMIPS' source code are used to assign protein domains and anchor genes to this or that category (change them, if you like):",
{ justify => "full", all => 1 }
)
. "\n"
. "InterPro IPR ID => Anchor gene domain type (%iprs variable):\n"
. Dumper( \%iprs ) . "\n"
. "Anchor gene domain type => Anchor gene type (%domains variable):\n"
. Dumper( \%domains ) . "\n\n"
. "Contact: gianni.panagiotou\@leibniz-hki.de, thomas.wolf\@leibniz-hki.de";
# print README to file
open( my $readme, ">", "README" ) or die "Cannot write to file \"./README\".\n", $!;
say $readme $usage;
close $readme;
# and show it on the screen
if ( which("tty") )
{
if ( system( "tty", "-s" ) == 0 ) # check if stdin is a terminal
{
system( "less", "README" );
}
else
{
system( "cat", "README" );
}
}
else # no tty --> possibly running on windows
{
system("type README");
}
exit;
}
# read arguments from command line
my $help;
my $path_to_interproscan;
GetOptions(
"help|h" => \$help,
"path-to-interproscan|p=s" => \$path_to_interproscan
) or die "[ERROR] Please check your command line.\nStopped";
# if no input file given or asked for help
# --> print help information and domain assignments, and exit
usage() if ( !@ARGV or $help );
# if more than one input file --> print current file
my $print_file = 1 if ( @ARGV > 1 );
# run SMIPS for each given input file
while ( my $input_file = shift )
{
say "\nWorking an \"$input_file\":" if ($print_file);
# STEP 0a - guess input file type
my %input_type;
if ( $input_file =~ m/\.tsv$/ )
{
$input_type{interpro} = 1;
}
elsif ( $input_file =~ m/\.out$/ ) # same like .tsv from InterPro, just different file extension
{
$input_type{interpro} = 1;
}
elsif ( $input_file =~ m/\.tab$/ )
{
$input_type{jgi} = 1;
}
elsif ( $input_file =~ m/\.fasta$/ )
{
$input_type{fasta} = 1;
}
else
{
die
"[ERROR] Sorry, but SMIPS only supports InterProScan files in TSV or OUT format, JGI files in TAB format, or protein sequences in FASTA format.\nStopped";
}
# if input file == protein sequences
if ( $input_type{fasta} )
{
# remove stars from sequences, otherwise InterProScan will complain
say "Removing \'*\' characters from input file, if any. InterProScan doesn't like them.";
remove_stars($input_file);
# die if InterProScan not installed or path unkown
if ( !$path_to_interproscan )
{
if ( !which("interproscan") )
{
die "[ERROR] Please install InterProScan first or tell me its location via --path-to-interproscan.\nStopped";
}
else
{
$path_to_interproscan = which("interproscan");
}
}
# run interproscan on protein fasta input file
say "Running InterProScan on protein sequences file \"$input_file\". This may take several hours. Please be patient …";
say "\n>>>>> InterProScan BEGIN >>>>>";
# system( $path_to_interproscan, "-appl", "PrositePatterns,SuperFamily,PfamA,SMART,TIGRFAM", "-i", $input_file, "-f", "tsv,svg", "-iprlookup" ); # for older InterProScan versions with HTML bug
system( $path_to_interproscan, "-appl", "PrositePatterns,SuperFamily,PfamA,SMART,TIGRFAM", "-i", $input_file, "-f", "tsv,html", "-iprlookup" );
say "<<<<< InterProScan END <<<<<\n";
# point to new interproscan output file
$input_file .= ".tsv";
$input_type{interpro} = 1;
say "InterProScan finished with output file \"$input_file\".";
}
# # STEP 0b - create backup and remove some common unescaped double-quotes
# my $count_substitutions;
# my @lines;
#
# # read original input file
# open( my $original, "<", $input_file ) or die "Cannot read from InterProScan file \"$input_file\".\n", $!;
# while (<$original>)
# {
# $count_substitutions += ( $_ =~ s/"G-D-X-G"/G-D-X-G/ig );
# $count_substitutions += ( $_ =~ s/"G-D-S-L"/G-D-S-L/ig );
# $count_substitutions += ( $_ =~ s/Appr-1"-p/Appr-1-p/ig );
# $count_substitutions += ( $_ =~ s/"FY-rich"/FY-rich/ig );
# $count_substitutions += ( $_ =~ s/"Winged helix"/Winged helix/ig );
# $count_substitutions += ( $_ =~ s/"zincins"/zincins/ig );
# $count_substitutions += ( $_ =~ s/"zinc finger"/zinc finger/ig );
# push( @lines, $_ ); # save changed and unchanged lines to array
# }
# close $original;
#
# if ($count_substitutions)
# {
# # rename original input file
# # write lines to new file with name of the original one
# rename( $input_file, "$input_file.bak" ) or die "Cannot rename \"$input_file\" to \"$input_file.bak\".\n", $!;
# open( my $modified, ">", $input_file ) or die "Cannot write to InterProScan file \"$input_file\".\n", $!;
# print $modified @lines;
# close $modified;
# say "[Warning] Removed $count_substitutions unescaped double quote(s) from input file \"$input_file\"";
# }
# STEP 1 - collect all possible anchor genes
sub all_backbones { }
my %all_backbones;
open( my $in, "<", $input_file ) or die "Cannot read from InterProScan file \"$input_file\".\n", $!;
while ( my $row = $tsv->getline($in) )
{
next if ( "@$row" eq "" or index( $row->[0], "#" ) == 0 ); # comment | emtpy line --> next line
# some checks to recognize faulty formatted input files
if ( $input_type{interpro} )
{
die "Too little/many columns in line "
. $tsv->record_number()
. ". Maybe wrong input file format? SMIPS only supports InterProScan files with 11-15 columns.\nStopped"
if ( @$row < 11 or @$row > 15 );
die "Start of protein domain \"$row->[6]\" in line " . $tsv->record_number() . " isn't numeric. Maybe wrong input file format?.\nStopped"
if ( !looks_like_number( $row->[6] ) );
die "Stop of protein domain \"$row->[7]\" in line " . $tsv->record_number() . " isn't numeric. Maybe wrong input file format?.\nStopped"
if ( !looks_like_number( $row->[7] ) );
}
elsif ( $input_type{jgi} )
{
die "Too little/much columns in line "
. $tsv->record_number()
. ". Maybe wrong input file format? SMIPS only supports JGI files with 10 columns.\nStopped"
if ( @$row != 10 );
die "Start of protein domain \"$row->[7]\" in line " . $tsv->record_number() . " isn't numeric. Maybe wrong input file format?.\nStopped"
if ( $row->[7] !~ m/^[0-9|,|-]+$/ );
die "Stop of protein domain \"$row->[8]\" in line " . $tsv->record_number() . " isn't numeric. Maybe wrong input file format?.\nStopped"
if ( $row->[8] !~ m/^[0-9|,|-]+$/ );
die "No IPR number \"$row->[1]\" in column 2, line " . $tsv->record_number() . ". Maybe wrong input file format?.\nStopped"
if ( $row->[1] !~ m/^IPR\d+/ );
}
# IPR assigned to row
# and IPR corresponds to one of the collected anchor gene domains
if ( $input_type{interpro} and $row->[11] and $iprs{ $row->[11] } )
{
my $id = $row->[0];
my $ipr = $row->[11];
my $domain = $iprs{$ipr};
my $domain_type = $domains{$domain};
my $start = $row->[6];
my $stop = $row->[7];
my $annotation = $row->[12];
die "One of the necessary columns in line " . $tsv->record_number() . " is empty. Maybe wrong input file format?.\nStopped"
if ( any { $_ eq "" } ( $id, $ipr, $start, $stop, $annotation ) );
# gene ID not present yet --> add new ID
if ( !$all_backbones{$id} )
{
$all_backbones{$id} = {
domains => {
$domain => [
{ start => $start, stop => $stop, ipr => { $ipr => 1 }, domain_type => $domain_type, annotation => { $annotation => 1 } }
]
}
};
}
# gene ID present, but not for current domain --> add new domain to present ID
elsif ( !$all_backbones{$id}{domains}{$domain} )
{
$all_backbones{$id}{domains}{$domain} =
[ { start => $start, stop => $stop, ipr => { $ipr => 1 }, domain_type => $domain_type, annotation => { $annotation => 1 } } ];
}
# gene ID and domain already present --> add another instance of that domain
else
{
push(
@{ $all_backbones{$id}{domains}{$domain} },
{ start => $start, stop => $stop, ipr => { $ipr => 1 }, domain_type => $domain_type, annotation => { $annotation => 1 } }
);
}
}
elsif ( ( $input_type{jgi} and $row->[1] and $iprs{ $row->[1] } ) )
{
my $id = $row->[0];
my $ipr = $row->[1];
my $domain = $iprs{$ipr};
my $domain_type = $domains{$domain};
my $start = $row->[7];
my $stop = $row->[8];
my $annotation = $row->[2];
die "One of the necessary columns in line " . $tsv->record_number() . " is empty. Maybe wrong input file format?.\nStopped"
if ( any { $_ eq "" } ( $id, $ipr, $start, $stop, $annotation ) );
# remove trailing "," if present
$start =~ s/,$//;
$stop =~ s/,$//;
my @starts = split( ",", $start );
my @stops = split( ",", $stop );
die "More/less domain start than stop positions in line " . $tsv->record_number() . ".\nStopped" if ( @starts != @stops );
for ( 0 .. $#starts )
{
my $start = $starts[$_];
my $stop = $stops[$_];
# possible for JGI files --> don't know how to treat --> skip
if ( $start < 0 )
{
warn "[WARNING] Start position \"$start\" in line " . $tsv->record_number(), " is negative. Skipping domain.\n";
next;
}
elsif ( $stop < 0 )
{
warn "[WARNING] Stop position \"$stop\" in line " . $tsv->record_number() . " is negative. Skipping domain.\n";
next;
}
# gene ID not present yet --> add new ID
if ( !$all_backbones{$id} )
{
$all_backbones{$id} = {
domains => {
$domain => [
{
start => $start,
stop => $stop,
ipr => { $ipr => 1 },
domain_type => $domain_type,
annotation => { $annotation => 1 }
}
]
}
};
}
# gene ID present, but not for current domain --> add new domain to present ID
elsif ( !$all_backbones{$id}{domains}{$domain} )
{
$all_backbones{$id}{domains}{$domain} =
[ { start => $start, stop => $stop, ipr => { $ipr => 1 }, domain_type => $domain_type, annotation => { $annotation => 1 } } ];
}
# gene ID and domain already present --> add another instance of that domain
else
{
push(
@{ $all_backbones{$id}{domains}{$domain} },
{ start => $start, stop => $stop, ipr => { $ipr => 1 }, domain_type => $domain_type, annotation => { $annotation => 1 } }
);
}
}
}
}
# if we didn't reach the end of the file, something went wrong
# --> let's see what happened
if ( !$tsv->eof )
{
$tsv->error_diag;
die $tsv->error_input;
}
close $in;
# print all anchor domains, found in InterProScan output, to file
open( my $out_all_domains, ">", $input_file . ".anchor_gene_domains.csv" )
or die "Cannot write to csv output file \"" . $input_file . ".anchor_gene_domains.csv" . "\".\n", $!;
$tsv->print( $out_all_domains, [ "# ID", "Domain", "Start", "Stop", "IPR", "Type", "Annotation" ] ) or die $tsv->error_diag();
for my $id ( nsort( keys %all_backbones ) )
{
for my $domain ( sort keys $all_backbones{$id}{domains} )
{
for my $domain_occurrence ( @{ $all_backbones{$id}{domains}{$domain} } )
{
# only one key for ipr and domain_type at the moment
$tsv->print(
$out_all_domains,
[
$id, $domain, $domain_occurrence->{start},
$domain_occurrence->{stop}, keys $domain_occurrence->{ipr}, $domain_occurrence->{domain_type},
keys $domain_occurrence->{annotation}
]
) or die $tsv->error_diag();
}
}
}
close $out_all_domains;
# STEP 2 - select all anchor genes with a minimal set of core domains
# --> check for existence of core domains
# --> determine anchor type
# --> count number of their occurrences
sub minimal_backbones { }
my %minimal_backbones;
my %counts;
for ( keys %all_backbones )
{
my @domains = keys $all_backbones{$_}{domains}; # collect all domains present in the current anchor gene
# pure PKS
if ( exists $all_backbones{$_}{domains}{KS}
and exists $all_backbones{$_}{domains}{AT}
and exists $all_backbones{$_}{domains}{PP} )
{
# or NRPS-PKS hybrid
if ( exists $all_backbones{$_}{domains}{A}
or exists $all_backbones{$_}{domains}{C} )
{
$all_backbones{$_}{backbone_type} = "NRPS-PKS hybrid";
}
else
{
$all_backbones{$_}{backbone_type} = "PKS";
}
}
# pure NRPS
elsif ( exists $all_backbones{$_}{domains}{A}
and exists $all_backbones{$_}{domains}{PP}
and exists $all_backbones{$_}{domains}{C} )
{
# or NRPS-PKS hybrid
if ( exists $all_backbones{$_}{domains}{KS}
or exists $all_backbones{$_}{domains}{AT}
or exists $all_backbones{$_}{domains}{Cyc}
or exists $all_backbones{$_}{domains}{KR}
or exists $all_backbones{$_}{domains}{DH}
or exists $all_backbones{$_}{domains}{ER}
# or exists $all_backbones{$_}{domains}{MT}
)
{
$all_backbones{$_}{backbone_type} = "NRPS-PKS hybrid";
}
else
{
$all_backbones{$_}{backbone_type} = "NRPS";
}
}
# KS-only
elsif ( exists $all_backbones{$_}{domains}{KS} )
{
if ( any { $_ ne "KS" } @domains )
{
# or NRPS-PKS hybrid-like
if ( exists $all_backbones{$_}{domains}{A}
or exists $all_backbones{$_}{domains}{C} )
{
$all_backbones{$_}{backbone_type} = "NRPS-PKS hybrid-like";
}
# or PKS-like?
else
{
$all_backbones{$_}{backbone_type} = "PKS-like";
}
}
else
{
$all_backbones{$_}{backbone_type} = "KS-only";
}
}
# C-only
elsif ( exists $all_backbones{$_}{domains}{C} )
{
if ( any { $_ ne "C" } @domains )
{
# or NRPS-PKS hybrid-like
if ( exists $all_backbones{$_}{domains}{KS}
or exists $all_backbones{$_}{domains}{AT}
or exists $all_backbones{$_}{domains}{Cyc}
or exists $all_backbones{$_}{domains}{KR}
or exists $all_backbones{$_}{domains}{DH}
or exists $all_backbones{$_}{domains}{ER}
# or exists $all_backbones{$_}{domains}{MT}
)
{
$all_backbones{$_}{backbone_type} = "NRPS-PKS hybrid-like";
}
# or NRPS-like
else
{
$all_backbones{$_}{backbone_type} = "NRPS-like";
}
}
else
{
$all_backbones{$_}{backbone_type} = "C-only";
}
}
# NRPS/PKS-like … (* similar to definition by SMURF) ################################################
elsif ( @domains >= 2 )
{
# NRPS-PKS hybrid-like* --> at one PKS and one NRPS domain
# if ( ( any { /^(KS|AT|Cyc|KR|DH|ER|MT)$/ } @domains ) and ( any { /^(A|C)$/ } @domains ) )
if ( ( any { /^(KS|AT|Cyc|KR|DH|ER)$/ } @domains ) and ( any { /^(A|C)$/ } @domains ) )
{
$all_backbones{$_}{backbone_type} = "NRPS-PKS hybrid-like*";
}
# PKS-like --> at least two PKS domains
# elsif ( ( grep { /^(KS|AT|Cyc|KR|DH|ER|MT|PP)$/ } @domains ) >= 2 )
elsif ( ( grep { /^(KS|AT|Cyc|KR|DH|ER|PP)$/ } @domains ) >= 2 )
{
$all_backbones{$_}{backbone_type} = "PKS-like*";
}
# NRPS-like --> at least two NRPS domains
elsif ( ( grep { /^(A|C|PP)$/ } @domains ) >= 2 )
{
$all_backbones{$_}{backbone_type} = "NRPS-like*";
}
}
#####################################################################################################
# trans-AT (exactly one AT domain which might belong to a NRPS that lacks the A domain)
elsif ( @domains == 1 and $domains[0] eq "AT" )
{
$all_backbones{$_}{backbone_type} = "trans-AT";
}
# pure DMATS
elsif ( exists $all_backbones{$_}{domains}{Prenyltransferase} )
{
# or DMATS-NRPS-PKS hybrid
if (
(
exists $all_backbones{$_}{domains}{KS}
or exists $all_backbones{$_}{domains}{AT}
or exists $all_backbones{$_}{domains}{Cyc}
or exists $all_backbones{$_}{domains}{KR}
or exists $all_backbones{$_}{domains}{DH}
or exists $all_backbones{$_}{domains}{ER}
# or exists $all_backbones{$_}{domains}{MT}
)
and ( exists $all_backbones{$_}{domains}{A}
or exists $all_backbones{$_}{domains}{C} )
)
{
$all_backbones{$_}{backbone_type} = "DMATS-NRPS-PKS hybrid";
}
# or DMATS-PKS hybrid
elsif (exists $all_backbones{$_}{domains}{KS}
or exists $all_backbones{$_}{domains}{AT}
or exists $all_backbones{$_}{domains}{Cyc}
or exists $all_backbones{$_}{domains}{KR}
or exists $all_backbones{$_}{domains}{DH}
or exists $all_backbones{$_}{domains}{ER}
# or exists $all_backbones{$_}{domains}{MT}
)
{
$all_backbones{$_}{backbone_type} = "DMATS-PKS hybrid";
}
# or DMATS-NRPS hybrid
elsif (exists $all_backbones{$_}{domains}{A}
or exists $all_backbones{$_}{domains}{C} )
{
$all_backbones{$_}{backbone_type} = "DMATS-NRPS hybrid";
}
else
{
$all_backbones{$_}{backbone_type} = "DMATS";
}
}
# current gene got an anchor type --> minimal set of core domains present
if ( $all_backbones{$_}{backbone_type} )
{
$counts{ $all_backbones{$_}{backbone_type} }++;
$minimal_backbones{$_} = $all_backbones{$_};
}
}
# STEP 3 - sort and merge domains of "true" anchor genes
# --> sort domains by position
# --> merge domains which belong together
sub sort_and_merge { }
my %minimal_backbones_sorted;
for my $id ( keys %minimal_backbones )
{
# make all instances of all domains of a gene sortable by their start and stop positions
# and sort them by position, huh
my @domains;
for my $domain ( keys $minimal_backbones{$id}{domains} )
{
push( @domains,
map { { start => $_->{start}, stop => $_->{stop}, name => $domain, content => $_ } } @{ $minimal_backbones{$id}{domains}{$domain} } );
}
@domains =
map { { name => $_->{name}, %{ $_->{content} } } } sort { $a->{start} <=> $b->{start} || $a->{stop} <=> $b->{stop} } @domains;
# set anchor type and prepare domains array
$minimal_backbones_sorted{$id} = { backbone_type => $minimal_backbones{$id}{backbone_type}, domains => [] };
# merge all domains $j=2…n, which are of the same type as domain $i=1…n-1, into domain $i
for ( my $i = 0 ; $i <= $#domains ; $i++ )
{
my $jump = $i;
for ( my $j = $i + 1 ; $j <= $#domains ; $j++ )
{
# overlapping domains of same type
# or adjecent domains of same type with completely different IPR(s)?
# --> merge both into first domain
if ( $domains[$i]{name} eq $domains[$j]{name}
and ( $domains[$i]{stop} >= $domains[$j]{start} or !any { $domains[$j]{ipr}{$_} } keys $domains[$i]{ipr} ) )
{
my %merged_iprs = ( %{ $domains[$i]{ipr} }, %{ $domains[$j]{ipr} } );
my %merged_annotations = ( %{ $domains[$i]{annotation} }, %{ $domains[$j]{annotation} } );
my $start;
my $stop;
( $domains[$i]{start} < $domains[$j]{start} ) ? ( $start = $domains[$i]{start} ) : ( $start = $domains[$j]{start} );
( $domains[$i]{stop} > $domains[$j]{stop} ) ? ( $stop = $domains[$i]{stop} ) : ( $stop = $domains[$j]{stop} );
$domains[$i] = {
name => $domains[$i]{name},
ipr => \%merged_iprs,
start => $start,
stop => $stop,
domain_type => $domains[$i]{domain_type},
annotation => \%merged_annotations
};
$jump = $j;
}
else
{
$jump = $j - 1;
last; # end inner for loop if adjacent domains are not of same type (anymore)
}
}
push( @{ $minimal_backbones_sorted{$id}{domains} }, $domains[$i] ); # save type and content of (maybe merged) domain
$i = $jump; # next $i will be current $j ($jump=$j-1) OR one position in advance of current $j ($jump=$j)
}
}
# STEP 4 - print results file
# --> basically, content of %minimal_backbones in csv format
sub print { }
open( my $out_backbones, ">", $input_file . ".anchor_genes.csv" )
or die "Cannot write to csv output file \"" . $input_file . ".anchor_genes.csv" . "\".\n", $!;
# MS Excel is just stupid!!!
# (1) Either uncomment to following line, to tell excel which column separator to use
# (but this will add a non-comment otherwise senseless first line to the output file)
# OR (2) change the file extension to ".txt".
# Without those cheats, excel doesn't recognize the tabs and will show each line in a single column.
# While applying one of those cheats, Excel will "see" now (why? nothing changed!) the tabs.
#
# say $out "sep=\t";
# first, some statistics and explanations
if ( any { /NRPS|PKS|DMATS/ and !/like/ } keys %counts )
{
say $out_backbones "## NRPS, PKS, and DMATS genes:";
say $out_backbones "# $_ $counts{$_}" for ( sort grep { /NRPS|PKS|DMATS/ and !/like/ } keys %counts );
say $out_backbones "#";
}
if ( any { /NRPS|PKS/ and /like/ and !/hybrid/ } keys %counts )
{
say $out_backbones "## NRPS- and PKS-like genes:";
say $out_backbones "# $_ $counts{$_}" for ( sort grep { /NRPS|PKS/ and /like/ and !/hybrid/ } keys %counts );
say $out_backbones "#";
}
if ( any { /only|trans/ } keys %counts )
{
say $out_backbones "## Single domains (genes where only AT, C, or KS are reported):";
say $out_backbones "# $_ $counts{$_}" for ( sort grep { /only|trans/ } keys %counts );
say $out_backbones "#";
}
say $out_backbones "# -only: incomplete anchor genes, with only KS or C domain(s)" if ( any { /-only$/ } keys %counts );
say $out_backbones "# -like: incomplete anchor genes, with at least one KS and/or C domain" if ( any { /-like$/ } keys %counts );
say $out_backbones
"# -like*: incomplete anchor genes, with (at least) two typical PKS and/or NRPS domains (similar to definition by the SMURF tool)"
if ( any { /-like\*$/ } keys %counts );
say $out_backbones "# trans-AT: possible trans-AT domain, which is used by an somewhere else located PKS (which is lacking the AT domain)"
if ( any { /^trans-AT$/ } keys %counts );
say $out_backbones "#";
say $out_backbones "# A: Adenylation, AMP-dependent synthetase/ligase"
if ( any { /^A$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# AT: Acyl transferase" if ( any { /^AT$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# C: Condensation" if ( any { /^C$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# Cyc: Cyclase" if ( any { /^Cyc$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# DH: Dehydratase" if ( any { /^DH$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# ER: Enoylreductase, GroES-like, Alcohol dehydrogenase"
if ( any { /^ER$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# KR: Keto reductase" if ( any { /^KR$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# KS: Beta-ketoacyl synthase" if ( any { /^KS$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# MT: Methyltransferase" if ( any { /^MT$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# PP: Phosphopantetheine" if ( any { /^PP$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# TE: Thioesterase, Thioester reductase-like" if ( any { /^TE$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# xNRPS: typical NRPS domain with yet unkown function"
if ( any { /^xNRPS$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "# xPKS: typical PKS domain with yet unkown function"
if ( any { /^xPKS$/ } map { keys $_->{domains} } values %minimal_backbones );
say $out_backbones "#";
say $out_backbones "# \'X-Y\': depiction of distinct domains";
say $out_backbones "# \'X:Y\': depiction of overlapping domains";
say $out_backbones "#";
# next, the table head
say $out_backbones "## " . join( "\t", "Gene ID", "Anchor gene type", "Domain arrangement", "Domains" );
# finally, the content of %minimal_backbones
# NATURALLY sorted by the gene ID (or protein ID or whatever …)
for my $id ( nsort( keys %minimal_backbones_sorted ) )
{
my @domain_details;
my $domain_arrangement;
my $backbone_type;
# stringify the domain details
for ( @{ $minimal_backbones_sorted{$id}{domains} } )
{
push(
@domain_details,
join( ", ",
"name=$_->{name}", "type=$_->{domain_type}",
"start=$_->{start}", "stop=$_->{stop}",
"ipr=" . join( "|", sort keys $_->{ipr} ), "annotation=" . join( " | ", sort keys $_->{annotation} ) )
);
}
# gene/protein has more than one domain?
if ( @{ $minimal_backbones_sorted{$id}{domains} } > 1 )
{
for ( 0 .. $#{ $minimal_backbones_sorted{$id}{domains} } - 1 )
{
my $delimiter = "-";
# overlapping domains of different types?
# WARNING: domains overlapping with more than one other domain will not be recognized and marked with ":"
# --> only the first overlap will have a proper representation in the output
if ( $minimal_backbones_sorted{$id}{domains}[$_]{stop} >= $minimal_backbones_sorted{$id}{domains}[ $_ + 1 ]{start} )
{
warn "[WARNING] Overlapping domains in protein \"$id\": "
. "<$minimal_backbones_sorted{$id}{domains}[$_]{start}"
. " [$minimal_backbones_sorted{$id}{domains}[$_]{name}] "
. "$minimal_backbones_sorted{$id}{domains}[$_]{stop}>" . " and "
. "<$minimal_backbones_sorted{$id}{domains}[$_+1]{start}"
. " [$minimal_backbones_sorted{$id}{domains}[$_+1]{name}] "
. "$minimal_backbones_sorted{$id}{domains}[$_+1]{stop}>\n";
$delimiter = ":";
}
$domain_arrangement .= $minimal_backbones_sorted{$id}{domains}[$_]{name} . $delimiter;
}
$domain_arrangement .= $minimal_backbones_sorted{$id}{domains}[-1]{name};
}
# gene/protein has only one domain? --> quite easy domain "arrangement"
else
{
$domain_arrangement = $minimal_backbones_sorted{$id}{domains}[0]{name};
}
# the anchor type is already there
$backbone_type = $minimal_backbones_sorted{$id}{backbone_type};
# print line to file
$tsv->print( $out_backbones, [ $id, $backbone_type, $domain_arrangement, @domain_details ] ) or die $tsv->error_diag();
}
close $out_backbones;
# also print short results to console
if ( any { /NRPS|PKS|DMATS/ and !/like/ } keys %counts )
{
say "NRPS, PKS, and DMATS genes:";
say "$_ $counts{$_}" for ( sort grep { /NRPS|PKS|DMATS/ and !/like/ } keys %counts );
}
if ( any { /NRPS|PKS/ and /like/ and !/hybrid/ } keys %counts )
{
say "NRPS- and PKS-like genes:";
say "$_ $counts{$_}" for ( sort grep { /NRPS|PKS/ and /like/ and !/hybrid/ } keys %counts );
}
if ( any { /only|trans/ } keys %counts )
{
say "Single domains (genes where only AT, C, or KS are reported):";
say "$_ $counts{$_}" for ( sort grep { /only|trans/ } keys %counts );
}
}
sub remove_stars
{
my $infile = shift;
my @temp_seq; # keep (changed) sequences in memory before writing to output
my $in = Bio::SeqIO->new( -file => "<$infile" );
while ( my $seq_obj = $in->next_seq() )
{
my $seq = $seq_obj->seq();
my $desc = $seq_obj->desc();
# check if protein input file actually contains protein sequences only
die "[ERROR] Sequence \"$desc\" is not a protein sequence. Please check your input file.\nStopped" if ( $seq_obj->alphabet() ne "protein" );
# remove all asterisks (if any)
$seq =~ s/\*//g;
$seq_obj->seq($seq);
$seq_obj->desc($desc);
push( @temp_seq, $seq_obj );
}
my $out = Bio::SeqIO->new( -file => ">$infile" ); # overwrite infile
$out->write_seq($_) for @temp_seq;
}