diff --git a/LICENSE b/LICENSE index 343eb7cf..7259957a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2020, macs3 project team All rights reserved. +Copyright (c) 2024, macs3 project team All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/MACS3/Commands/bdgbroadcall_cmd.py b/MACS3/Commands/bdgbroadcall_cmd.py index ba240489..df3f41e4 100644 --- a/MACS3/Commands/bdgbroadcall_cmd.py +++ b/MACS3/Commands/bdgbroadcall_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-05-15 10:41:43 Tao Liu> +# Time-stamp: <2024-10-02 15:55:43 Tao Liu> """Description: Fine-tuning script to call broad peaks from a single bedGraph track for scores. @@ -13,7 +13,6 @@ # python modules # ------------------------------------ -import sys import os from MACS3.IO import BedGraphIO # ------------------------------------ @@ -23,14 +22,14 @@ # ------------------------------------ # Misc functions # ------------------------------------ -import logging -import MACS3.Utilities.Logger +from MACS3.Utilities.Logger import logging logger = logging.getLogger(__name__) -debug = logger.debug -info = logger.info -error = logger.critical -warn = logger.warning +debug = logger.debug +info = logger.info +error = logger.critical +warn = logger.warning + # ------------------------------------ # Classes # ------------------------------------ @@ -38,21 +37,23 @@ # ------------------------------------ # Main function # ------------------------------------ -def run( options ): + + +def run(options): info("Read and build bedGraph...") bio = BedGraphIO.bedGraphIO(options.ifile) btrack = bio.read_bedGraph(baseline_value=0) info("Call peaks from bedGraph...") - bpeaks = btrack.call_broadpeaks (options.cutoffpeak, options.cutofflink, options.minlen, options.lvl1maxgap, options.lvl2maxgap) + bpeaks = btrack.call_broadpeaks(options.cutoffpeak, options.cutofflink, 
options.minlen, options.lvl1maxgap, options.lvl2maxgap) info("Write peaks...") if options.ofile: - bf = open( os.path.join( options.outdir, options.ofile ), "w" ) + bf = open(os.path.join(options.outdir, options.ofile), "w") options.oprefix = options.ofile else: - bf = open ( os.path.join( options.outdir, "%s_c%.1f_C%.2f_l%d_g%d_G%d_broad.bed12" % (options.oprefix,options.cutoffpeak,options.cutofflink,options.minlen,options.lvl1maxgap,options.lvl2maxgap)), "w" ) + bf = open(os.path.join(options.outdir, "%s_c%.1f_C%.2f_l%d_g%d_G%d_broad.bed12" % (options.oprefix,options.cutoffpeak, options.cutofflink, options.minlen, options.lvl1maxgap, options.lvl2maxgap)), "w") bpeaks.write_to_gappedPeak(bf, name_prefix=(options.oprefix+"_broadRegion").encode(), score_column="score", trackline=options.trackline) info("Done") diff --git a/MACS3/Commands/bdgcmp_cmd.py b/MACS3/Commands/bdgcmp_cmd.py index 655b8cf2..7479af57 100644 --- a/MACS3/Commands/bdgcmp_cmd.py +++ b/MACS3/Commands/bdgcmp_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-05-15 10:13:23 Tao Liu> +# Time-stamp: <2024-10-02 16:06:33 Tao Liu> """Description: compare bdg files @@ -7,14 +7,10 @@ the distribution). """ -import sys import os - from MACS3.IO import BedGraphIO from MACS3.Utilities.OptValidator import opt_validate_bdgcmp -from math import log as mlog - # ------------------------------------ # constants # ------------------------------------ @@ -27,15 +23,19 @@ # Main function # ------------------------------------ -def run( options ): - options = opt_validate_bdgcmp( options ) + +def run(options): + options = opt_validate_bdgcmp(options) info = options.info - warn = options.warn - debug = options.debug - error = options.error - + # warn = options.warn + # debug = options.debug + # error = options.error + scaling_factor = options.sfactor - pseudo_depth = 1.0/scaling_factor # not an actual depth, but its reciprocal, a trick to override SPMR while necessary. 
+ + # not an actual depth, but its reciprocal, a trick to override + # SPMR while necessary. + pseudo_depth = 1.0/scaling_factor info("Read and build treatment bedGraph...") tbio = BedGraphIO.bedGraphIO(options.tfile) @@ -46,47 +46,47 @@ def run( options ): cbtrack = cbio.read_bedGraph() info("Build ScoreTrackII...") - sbtrack = tbtrack.make_ScoreTrackII_for_macs( cbtrack, depth1 = pseudo_depth, depth2 = pseudo_depth ) + sbtrack = tbtrack.make_ScoreTrackII_for_macs(cbtrack, depth1=pseudo_depth, depth2=pseudo_depth) if abs(scaling_factor-1) > 1e-6: # Only for the case while your input is SPMR from MACS3 callpeak; Let's override SPMR. info("Values in your input bedGraph files will be multiplied by %f ..." % scaling_factor) - sbtrack.change_normalization_method( ord('M') ) # a hack to override SPMR - sbtrack.set_pseudocount( options.pseudocount ) + sbtrack.change_normalization_method(ord('M')) # a hack to override SPMR + sbtrack.set_pseudocount(options.pseudocount) already_processed_method_list = [] for (i, method) in enumerate(options.method): if method in already_processed_method_list: continue else: - already_processed_method_list.append( method ) + already_processed_method_list.append(method) info("Calculate scores comparing treatment and control by '%s'..." 
% method) if options.ofile: - ofile = os.path.join( options.outdir, options.ofile[ i ] ) + ofile = os.path.join(options.outdir, options.ofile[i]) else: - ofile = os.path.join( options.outdir, options.oprefix + "_" + method + ".bdg" ) + ofile = os.path.join(options.outdir, options.oprefix + "_" + method + ".bdg") # build score track if method == 'ppois': - sbtrack.change_score_method( ord('p') ) + sbtrack.change_score_method(ord('p')) elif method == 'qpois': - sbtrack.change_score_method( ord('q') ) + sbtrack.change_score_method(ord('q')) elif method == 'subtract': - sbtrack.change_score_method( ord('d') ) + sbtrack.change_score_method(ord('d')) elif method == 'logFE': - sbtrack.change_score_method( ord('f') ) + sbtrack.change_score_method(ord('f')) elif method == 'FE': - sbtrack.change_score_method( ord('F') ) + sbtrack.change_score_method(ord('F')) elif method == 'logLR': # log likelihood - sbtrack.change_score_method( ord('l') ) + sbtrack.change_score_method(ord('l')) elif method == 'slogLR': # log likelihood - sbtrack.change_score_method( ord('s') ) + sbtrack.change_score_method(ord('s')) elif method == 'max': - sbtrack.change_score_method( ord('M') ) + sbtrack.change_score_method(ord('M')) else: raise Exception("Can't reach here!") info("Write bedGraph of scores...") - ofhd = open( ofile, "w" ) + ofhd = open(ofile, "w") # write_bedGraph function for ScoreTrack - sbtrack.write_bedGraph(ofhd,name="%s_Scores" % (method.upper()),description="Scores calculated by %s" % (method.upper()), column = 3) + sbtrack.write_bedGraph(ofhd, name="%s_Scores" % (method.upper()), description="Scores calculated by %s" % (method.upper()), column=3) info("Finished '%s'! Please check '%s'!" 
% (method, ofile)) diff --git a/MACS3/Commands/bdgdiff_cmd.py b/MACS3/Commands/bdgdiff_cmd.py index 148b476e..ecb17b76 100644 --- a/MACS3/Commands/bdgdiff_cmd.py +++ b/MACS3/Commands/bdgdiff_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-05-15 10:42:27 Tao Liu> +# Time-stamp: <2024-10-02 16:11:19 Tao Liu> """Description: Naive call differential peaks from 4 bedGraph tracks for scores. @@ -11,7 +11,6 @@ # python modules # ------------------------------------ -import sys import os from MACS3.IO import BedGraphIO from MACS3.Signal import ScoreTrack @@ -23,14 +22,13 @@ # ------------------------------------ # Misc functions # ------------------------------------ -import logging -import MACS3.Utilities.Logger +from MACS3.Utilities.Logger import logging logger = logging.getLogger(__name__) -debug = logger.debug -info = logger.info -error = logger.critical -warn = logger.warning +debug = logger.debug +info = logger.info +error = logger.critical +warn = logger.warning # ------------------------------------ # Classes # ------------------------------------ @@ -38,13 +36,12 @@ # ------------------------------------ # Main function # ------------------------------------ -def run( options ): + + +def run(options): if options.maxgap >= options.minlen: error("MAXGAP should be smaller than MINLEN! 
Your input is MAXGAP = %d and MINLEN = %d" % (options.maxgap, options.minlen)) - LLR_cutoff = options.cutoff - ofile_prefix = options.oprefix - info("Read and build treatment 1 bedGraph...") t1bio = BedGraphIO.bedGraphIO(options.t1bdg) t1btrack = t1bio.read_bedGraph() @@ -68,44 +65,43 @@ def run( options ): depth1 = depth2 / depth1 depth2 = 1.0 elif depth1 < depth2: # scale down condition 2 to size of condition 1 - depth2 = depth1/ depth2 + depth2 = depth1 / depth2 depth1 = 1.0 else: # no need to scale down any depth1 = 1.0 depth2 = 1.0 - twoconditionscore = ScoreTrack.TwoConditionScores( t1btrack, - c1btrack, - t2btrack, - c2btrack, - depth1, - depth2 ) + twoconditionscore = ScoreTrack.TwoConditionScores(t1btrack, + c1btrack, + t2btrack, + c2btrack, + depth1, + depth2) twoconditionscore.build() twoconditionscore.finalize() - (cat1,cat2,cat3) = twoconditionscore.call_peaks(min_length=options.minlen, max_gap=options.maxgap, cutoff=options.cutoff) + (cat1, cat2, cat3) = twoconditionscore.call_peaks(min_length=options.minlen, max_gap=options.maxgap, cutoff=options.cutoff) info("Write peaks...") if options.ofile: - ofiles = [os.path.join( options.outdir, x ) for x in options.ofile] - name_prefix = [ x.encode() for x in options.ofile ] + ofiles = [os.path.join(options.outdir, x) for x in options.ofile] + name_prefix = [x.encode() for x in options.ofile] else: - ofiles = [ os.path.join( options.outdir, "%s_c%.1f_cond1.bed" % (options.oprefix,options.cutoff)), - os.path.join( options.outdir, "%s_c%.1f_cond2.bed" % (options.oprefix,options.cutoff)), - os.path.join( options.outdir, "%s_c%.1f_common.bed" % (options.oprefix,options.cutoff)) - ] - name_prefix = [ x.encode() for x in [ options.oprefix+"_cond1_", options.oprefix+"_cond2_", options.oprefix+"_common_" ]] - - nf = open( ofiles[ 0 ], 'w' ) - cat1.write_to_bed(nf, name_prefix=name_prefix[ 0 ], name=b"condition 1", description=b"unique regions in condition 1", score_column="score") + ofiles = 
[os.path.join(options.outdir, "%s_c%.1f_cond1.bed" % (options.oprefix, options.cutoff)), + os.path.join(options.outdir, "%s_c%.1f_cond2.bed" % (options.oprefix, options.cutoff)), + os.path.join(options.outdir, "%s_c%.1f_common.bed" % (options.oprefix, options.cutoff)) + ] + name_prefix = [x.encode() for x in [options.oprefix+"_cond1_", options.oprefix+"_cond2_", options.oprefix+"_common_"]] + + nf = open(ofiles[0], 'w') + cat1.write_to_bed(nf, name_prefix=name_prefix[0], name=b"condition 1", description=b"unique regions in condition 1", score_column="score") nf.close() - nf = open( ofiles[ 1 ], 'w' ) - cat2.write_to_bed(nf, name_prefix=name_prefix[ 1 ], name=b"condition 2", description=b"unique regions in condition 2", score_column="score") + nf = open(ofiles[1], 'w') + cat2.write_to_bed(nf, name_prefix=name_prefix[1], name=b"condition 2", description=b"unique regions in condition 2", score_column="score") nf.close() - nf = open( ofiles[ 2 ], 'w' ) - cat3.write_to_bed(nf, name_prefix=name_prefix[ 2 ], name=b"common", description=b"common regions in both conditions", score_column="score") + nf = open(ofiles[2], 'w') + cat3.write_to_bed(nf, name_prefix=name_prefix[2], name=b"common", description=b"common regions in both conditions", score_column="score") nf.close() info("Done") - diff --git a/MACS3/Commands/bdgopt_cmd.py b/MACS3/Commands/bdgopt_cmd.py index 8d43a44c..3a18fd1a 100644 --- a/MACS3/Commands/bdgopt_cmd.py +++ b/MACS3/Commands/bdgopt_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-05-15 11:15:48 Tao Liu> +# Time-stamp: <2024-10-02 16:12:50 Tao Liu> """Description: Modify bedGraph file @@ -10,7 +10,6 @@ # ------------------------------------ # python modules # ------------------------------------ -import sys import os from MACS3.IO import BedGraphIO @@ -31,13 +30,15 @@ # ------------------------------------ # Main function # ------------------------------------ -def run( options ): - options = opt_validate_bdgopt( options ) + + +def run(options): + options = 
opt_validate_bdgopt(options) info = options.info - warn = options.warn - debug = options.debug - error = options.error - + # warn = options.warn + # debug = options.debug + # error = options.error + info("Read and build bedGraph...") bio = BedGraphIO.bedGraphIO(options.ifile) btrack = bio.read_bedGraph(baseline_value=0) @@ -49,18 +50,15 @@ def run( options ): else: extraparam = float(options.extraparam[0]) if options.method.lower() == "multiply": - btrack.apply_func( lambda x: x * extraparam) + btrack.apply_func(lambda x: x * extraparam) elif options.method.lower() == "add": - btrack.apply_func( lambda x: x + extraparam) + btrack.apply_func(lambda x: x + extraparam) elif options.method.lower() == "max": - btrack.apply_func( lambda x: x if x> extraparam else extraparam ) + btrack.apply_func(lambda x: x if x > extraparam else extraparam) elif options.method.lower() == "min": - btrack.apply_func( lambda x: x if x< extraparam else extraparam ) + btrack.apply_func(lambda x: x if x < extraparam else extraparam) - ofile = BedGraphIO.bedGraphIO( os.path.join( options.outdir, options.ofile ), data = btrack ) + ofile = BedGraphIO.bedGraphIO(os.path.join(options.outdir, options.ofile), data=btrack) info("Write bedGraph of modified scores...") - ofile.write_bedGraph(name="%s_modified_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper())) + ofile.write_bedGraph(name="%s_modified_scores" % (options.method.upper()), description="Scores calculated by %s" % (options.method.upper())) info("Finished '%s'! Please check '%s'!" 
% (options.method, ofile.bedGraph_filename)) - - - diff --git a/MACS3/Commands/bdgpeakcall_cmd.py b/MACS3/Commands/bdgpeakcall_cmd.py index 2240e987..ab4ff563 100644 --- a/MACS3/Commands/bdgpeakcall_cmd.py +++ b/MACS3/Commands/bdgpeakcall_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-05-15 10:43:03 Tao Liu> +# Time-stamp: <2024-10-02 16:16:49 Tao Liu> """Description: Naive call peaks from a single bedGraph track for scores. @@ -11,7 +11,6 @@ # ------------------------------------ # python modules # ------------------------------------ -import sys import os from MACS3.IO import BedGraphIO # ------------------------------------ @@ -21,14 +20,13 @@ # ------------------------------------ # Misc functions # ------------------------------------ -import logging -import MACS3.Utilities.Logger +from MACS3.Utilities.Logger import logging logger = logging.getLogger(__name__) -debug = logger.debug -info = logger.info -error = logger.critical -warn = logger.warning +debug = logger.debug +info = logger.info +error = logger.critical +warn = logger.warning # ------------------------------------ # Classes # ------------------------------------ @@ -36,33 +34,32 @@ # ------------------------------------ # Main function # ------------------------------------ -def run( options ): + + +def run(options): info("Read and build bedGraph...") bio = BedGraphIO.bedGraphIO(options.ifile) btrack = bio.read_bedGraph(baseline_value=0) if options.cutoff_analysis: info("Analyze cutoff vs number of peaks/total length of peaks/average length of peak") - cutoff_analysis_result = btrack.cutoff_analysis( int(options.maxgap), int(options.minlen), min_score = btrack.minvalue, max_score = int(options.cutoff_analysis_max), steps = int(options.cutoff_analysis_steps) ) + cutoff_analysis_result = btrack.cutoff_analysis(int(options.maxgap), int(options.minlen), min_score=btrack.minvalue, max_score=int(options.cutoff_analysis_max), steps=int(options.cutoff_analysis_steps)) info("Write report...") if options.ofile: 
- fhd = open( os.path.join( options.outdir, options.ofile ), 'w' ) + fhd = open(os.path.join(options.outdir, options.ofile), 'w') else: - fhd = open ( os.path.join( options.outdir, "%s_l%d_g%d_cutoff_analysis.txt" % (options.oprefix,options.minlen,options.maxgap)), "w" ) - fhd.write( cutoff_analysis_result ) + fhd = open(os.path.join(options.outdir, "%s_l%d_g%d_cutoff_analysis.txt" % (options.oprefix,options.minlen, options.maxgap)), "w") + fhd.write(cutoff_analysis_result) info("Done") else: info("Call peaks from bedGraph...") - peaks = btrack.call_peaks(cutoff=float(options.cutoff),min_length=int(options.minlen),max_gap=int(options.maxgap),call_summits=options.call_summits) + peaks = btrack.call_peaks(cutoff=float(options.cutoff), min_length=int(options.minlen), max_gap=int(options.maxgap), call_summits=options.call_summits) info("Write peaks...") if options.ofile: options.oprefix = options.ofile - nf = open( os.path.join( options.outdir, options.ofile ), 'w' ) + nf = open(os.path.join(options.outdir, options.ofile), 'w') else: - nf = open ( os.path.join( options.outdir, "%s_c%.1f_l%d_g%d_peaks.narrowPeak" % (options.oprefix,options.cutoff,options.minlen,options.maxgap)), "w" ) + nf = open(os.path.join(options.outdir, "%s_c%.1f_l%d_g%d_peaks.narrowPeak" % (options.oprefix, options.cutoff, options.minlen, options.maxgap)), "w") peaks.write_to_narrowPeak(nf, name=options.oprefix.encode(), name_prefix=(options.oprefix+"_narrowPeak").encode(), score_column="score", trackline=options.trackline) info("Done") - - - diff --git a/MACS3/Commands/callpeak_cmd.py b/MACS3/Commands/callpeak_cmd.py index c7b4622c..024f539e 100644 --- a/MACS3/Commands/callpeak_cmd.py +++ b/MACS3/Commands/callpeak_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2020-11-28 17:06:30 Tao Liu> +# Time-stamp: <2024-10-02 15:58:36 Tao Liu> """Description: MACS 3 call peak main executable @@ -11,7 +11,6 @@ # python modules # ------------------------------------ -import os import sys from time import strftime 
import tempfile @@ -19,61 +18,65 @@ # ------------------------------------ # MACS3 python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +from MACS3.Utilities.Constants import MACS_VERSION, MAX_PAIRNUM from MACS3.Utilities.OptValidator import opt_validate_callpeak from MACS3.Signal.Prob import binomial_cdf_inv -from MACS3.Signal.PeakModel import PeakModel,NotEnoughPairsException +from MACS3.Signal.PeakModel import PeakModel, NotEnoughPairsException from MACS3.Signal.PeakDetect import PeakDetect from MACS3.IO.OutputWriter import model2r_script + # ------------------------------------ # Main function # ------------------------------------ + + def check_names(treat, control, error_stream): """check common chromosome names""" tchrnames = set(treat.get_chr_names()) cchrnames = set(control.get_chr_names()) commonnames = tchrnames.intersection(cchrnames) - if len(commonnames)==0: + if len(commonnames) == 0: error_stream("No common chromosome names can be found from treatment and control!") error_stream("Please make sure that the treatment and control alignment files were generated by using the same genome assembly!") error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames))) error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames))) sys.exit() -def run( args ): + +def run(args): """The Main function/pipeline for MACS. """ # Parse options... 
- options = opt_validate_callpeak( args ) + options = opt_validate_callpeak(args) # end of parsing commandline options info = options.info warn = options.warn debug = options.debug error = options.error - - #0 output arguments + + # 0 output arguments info("\n"+options.argtxt) - options.PE_MODE = options.format in ('BAMPE','BEDPE') + options.PE_MODE = options.format in ('BAMPE', 'BEDPE') if options.PE_MODE: - tag = 'fragment' # call things fragments not tags + tag = 'fragment' # call things fragments not tags else: tag = 'tag' tempfile.tempdir = options.tempdir - #1 Read tag files + # 1 Read tag files info("#1 read %s files...", tag) if options.PE_MODE: - (treat, control) = load_frag_files_options (options) + (treat, control) = load_frag_files_options(options) else: - (treat, control) = load_tag_files_options (options) + (treat, control) = load_tag_files_options(options) if control is not None: # check if chromosome names are consistent. quit if not. check_names(treat, control, error) info("#1 %s size = %.1f", tag, options.tsize) - tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize) + tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize) t0 = treat.total tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0) @@ -83,7 +86,7 @@ def run( args ): if options.keepduplicates != "all": if options.keepduplicates == "auto": info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag) - treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0) + treatment_max_dup_tags = cal_max_dup_tags(options.gsize, t0) info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags)) else: info("#1 user defined the maximum %ss...", tag) @@ -114,7 +117,7 @@ def run( args ): if options.keepduplicates != "all": if options.keepduplicates == "auto": info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag) - control_max_dup_tags = 
cal_max_dup_tags(options.gsize,c0) + control_max_dup_tags = cal_max_dup_tags(options.gsize, c0) info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags)) else: info("#1 user defined the maximum %ss...", tag) @@ -140,7 +143,7 @@ def run( args ): c1 = c0 info("#1 finished!") - #2 Build Model + # 2 Build Model info("#2 Build Peak Model...") if options.nomodel: @@ -148,17 +151,17 @@ def run( args ): if options.PE_MODE: options.d = options.tsize else: - options.d=options.extsize + options.d = options.extsize info("#2 Use %d as fragment length" % (options.d)) if options.shift > 0: - info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % (options.shift)) + info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % options.shift) elif options.shift < 0: info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1)) options.scanwindow=2*options.d # remove the effect of --bw else: - peakmodel = PeakModel(treatment = treat, - max_pairnum = MAX_PAIRNUM, - opt = options + peakmodel = PeakModel(treatment=treat, + max_pairnum=MAX_PAIRNUM, + opt=options ) try: peakmodel.build() @@ -168,16 +171,16 @@ def run( args ): debug("#2 d: %d" % (peakmodel.d)) debug("#2 scan_window: %d" % (peakmodel.scan_window)) info("#2 predicted fragment length is %d bps" % peakmodel.d) - info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d))) + info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str, peakmodel.alternative_d))) info("#2.2 Generate R script for model : %s" % (options.modelR)) - model2r_script(peakmodel,options.modelR,options.name) + model2r_script(peakmodel, options.modelR, options.name) options.d = peakmodel.d - options.scanwindow= 2*options.d + options.scanwindow = 2*options.d if options.d <= 2*options.tsize: warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" 
% (options.d)) if options.onauto: - options.d=options.extsize - options.scanwindow=2*options.d + options.d = options.extsize + options.scanwindow = 2 * options.d warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d)) else: warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel.alternative_d))) @@ -187,11 +190,11 @@ def run( args ): if not options.onauto: sys.exit(1) warn("#2 Skipped...") - options.d=options.extsize - options.scanwindow=2*options.d + options.d = options.extsize + options.scanwindow = 2 * options.d warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d)) - #3 Call Peaks + # 3 Call Peaks info("#3 Call peaks...") if options.nolambda: info("# local lambda is disabled!") @@ -242,19 +245,19 @@ def run( args ): # false, we will scale control to treatment. options.tocontrol = False - peakdetect = PeakDetect(treat = treat, - control = control, - opt = options + peakdetect = PeakDetect(treat=treat, + control=control, + opt=options ) peakdetect.call_peaks() # filter out low FE peaks - peakdetect.peaks.filter_fc( fc_low = options.fecutoff ) + peakdetect.peaks.filter_fc(fc_low=options.fecutoff) #4 output #4.1 peaks in XLS info("#4 Write output xls file... 
%s" % (options.peakxls)) - ofhd_xls = open( options.peakxls, "w" ) + ofhd_xls = open(options.peakxls, "w") ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION)) ofhd_xls.write(options.argtxt+"\n") ofhd_xls.write(tagsinfo) @@ -272,50 +275,57 @@ def run( args ): if options.nolambda: ofhd_xls.write("# local lambda is disabled!\n") # pass write method so we can print too, and include name - peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name.encode()) + peakdetect.peaks.write_to_xls(ofhd_xls, name=options.name.encode()) ofhd_xls.close() #4.2 peaks in BED - if options.log_pvalue != None: + if options.log_pvalue is not None: score_column = "pscore" - elif options.log_qvalue != None: + elif options.log_qvalue is not None: score_column = "qscore" - #4.2 peaks in narrowPeak + # 4.2 peaks in narrowPeak if not options.broad: info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak)) - ofhd_bed = open( options.peakNarrowPeak, "w" ) - peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), score_column=score_column, trackline=options.trackline ) + ofhd_bed = open(options.peakNarrowPeak, "w") + peakdetect.peaks.write_to_narrowPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), score_column=score_column, trackline=options.trackline) ofhd_bed.close() - #4.2-2 summits in BED + # 4.2-2 summits in BED info("#4 Write summits bed file... 
%s" % (options.summitbed)) - ofhd_summits = open( options.summitbed, "w" ) - peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_".encode(), name=options.name.encode(), - description=("Summits for %s (Made with MACS v2, " + strftime("%x") + ")").encode(), - score_column=score_column, trackline=options.trackline ) + ofhd_summits = open(options.summitbed, "w") + peakdetect.peaks.write_to_summit_bed(ofhd_summits, + name_prefix="%s_peak_".encode(), + name=options.name.encode(), + description=("Summits for %s (Made with MACS v3, " + + strftime("%x") + + ")").encode(), + score_column=score_column, + trackline=options.trackline) ofhd_summits.close() - #4.2 broad peaks in bed12 or gappedPeak + # 4.2 broad peaks in bed12 or gappedPeak else: info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak)) - ofhd_bed = open( options.peakBroadPeak, "w" ) - peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline) + ofhd_bed = open(options.peakBroadPeak, "w") + peakdetect.peaks.write_to_broadPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline) ofhd_bed.close() info("#4 Write broad peak in bed12/gappedPeak format file... 
%s" % (options.peakGappedPeak)) - ofhd_bed = open( options.peakGappedPeak, "w" ) - peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline) + ofhd_bed = open(options.peakGappedPeak, "w") + peakdetect.peaks.write_to_gappedPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline) ofhd_bed.close() info("Done!") -def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): + +def cal_max_dup_tags(genome_size, tags_number, p=1e-5): """Calculate the maximum duplicated tag number based on genome size, total tag number and a p-value based on binomial distribution. Brute force algorithm to calculate reverse CDF no more than MAX_LAMBDA(100000). """ - return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) + return binomial_cdf_inv(1-p, tags_number, 1.0/genome_size) -def load_frag_files_options ( options ): + +def load_frag_files_options(options): """From the options, load treatment fragments and control fragments (if available). 
""" @@ -323,13 +333,11 @@ def load_frag_files_options ( options ): tp = options.parser(options.tfile[0], buffer_size=options.buffer_size) treat = tp.build_petrack() - #treat.sort() if len(options.tfile) > 1: # multiple input for tfile in options.tfile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) - treat = tp.append_petrack( treat ) - #treat.sort() + treat = tp.append_petrack(treat) treat.finalize() options.tsize = tp.d @@ -338,23 +346,22 @@ def load_frag_files_options ( options ): cp = options.parser(options.cfile[0], buffer_size=options.buffer_size) control = cp.build_petrack() control_d = cp.d - #control.sort() if len(options.cfile) > 1: # multiple input for cfile in options.cfile[1:]: cp = options.parser(cfile, buffer_size=options.buffer_size) - control = cp.append_petrack( control ) - #control.sort() + control = cp.append_petrack(control) control.finalize() else: control = None options.info("#1 mean fragment size is determined as %.1f bp from treatment" % options.tsize) -# options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance) + # options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance) if control is not None: options.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d) return (treat, control) -def load_tag_files_options ( options ): + +def load_tag_files_options(options): """From the options, load treatment tags and control tags (if available). 
""" @@ -364,25 +371,21 @@ def load_tag_files_options ( options ): ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() - #treat.sort() if len(options.tfile) > 1: # multiple input for tfile in options.tfile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) - treat = tp.append_fwtrack( treat ) - #treat.sort() + treat = tp.append_fwtrack(treat) treat.finalize() if options.cfile: options.info("#1.2 read input tags...") control = options.parser(options.cfile[0], buffer_size=options.buffer_size).build_fwtrack() - #control.sort() if len(options.cfile) > 1: # multiple input for cfile in options.cfile[1:]: cp = options.parser(cfile, buffer_size=options.buffer_size) - control = cp.append_fwtrack( control ) - #control.sort() + control = cp.append_fwtrack(control) control.finalize() else: control = None diff --git a/MACS3/Commands/callvar_cmd.py b/MACS3/Commands/callvar_cmd.py index 1ae96c7a..7f1a8097 100644 --- a/MACS3/Commands/callvar_cmd.py +++ b/MACS3/Commands/callvar_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-05-07 11:47:04 Tao Liu> +# Time-stamp: <2024-10-02 16:34:23 Tao Liu> """Description: Call variants directly @@ -23,13 +23,13 @@ from functools import partial import multiprocessing as mp -from time import time +# from time import time from math import ceil # ------------------------------------ # own python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +from MACS3.Utilities.Constants import MACS_VERSION from MACS3.Utilities.OptValidator import opt_validate_callvar from MACS3.IO.PeakIO import PeakIO from MACS3.IO.BAM import BAMaccessor @@ -37,7 +37,7 @@ from MACS3.Signal.PeakVariants import PeakVariants -VCFHEADER_0="""##fileformat=VCFv4.1 +VCFHEADER_0 = """##fileformat=VCFv4.1 ##fileDate=%s ##source=MACS_V%s ##Program_Args=%s @@ -66,7 +66,7 @@ ##FORMAT= ##FORMAT=""" -VCFHEADER="""##fileformat=VCFv4.1 +VCFHEADER = """##fileformat=VCFv4.1 ##fileDate=%s ##source=MACS_V%s ##Program_Args=%s @@ 
-93,28 +93,29 @@ # Main function # ------------------------------------ + def check_names(treat, control, error_stream): """check common chromosome names""" tchrnames = set(treat.get_chr_names()) cchrnames = set(control.get_chr_names()) commonnames = tchrnames.intersection(cchrnames) - if len(commonnames)==0: + if len(commonnames) == 0: error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...") error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames))) error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames))) sys.exit() -def run( args ): +def run(args): """The Main function/pipeline for MACS """ - options = opt_validate_callvar( args ) + options = opt_validate_callvar(args) info = options.info - warn = options.warn - debug = options.debug - error = options.error + # warn = options.warn + # debug = options.debug + # error = options.error peakbedfile = options.peakbed tfile = options.tfile @@ -123,7 +124,7 @@ def run( args ): min_altallele_count = options.altalleleMinCount max_allowed_ar = options.maxAR NP = options.np - if NP<=0: + if NP <= 0: NP = 1 min_homo_GQ = options.GQCutoffHomo min_heter_GQ = options.GQCutoffHetero @@ -133,139 +134,140 @@ def run( args ): # parameter for assembly fermiMinOverlap = options.fermiMinOverlap fermi = options.fermi - - peakio = open( peakbedfile ) + + peakio = open(peakbedfile) peaks = PeakIO() i = 0 - for l in peakio: - fs = l.rstrip().split() + for t_peak in peakio: + fs = t_peak.rstrip().split() i += 1 - peaks.add( fs[0].encode(), int(fs[1]), int(fs[2]), name=b"%d" % i ) + peaks.add(fs[0].encode(), int(fs[1]), int(fs[2]), name=b"%d" % i) peaks.sort() - chrs = peaks.get_chr_names() + # chrs = peaks.get_chr_names() - tbam = BAMaccessor( tfile ) + tbam = BAMaccessor(tfile) if cfile: - cbam = BAMaccessor( cfile ) + cbam = BAMaccessor(cfile) assert tbam.get_chromosomes()[0] in cbam.get_chromosomes() or cbam.get_chromosomes()[0] 
in tbam.get_chromosomes(), Exception("It seems Treatment and Control BAM use different naming for chromosomes! Check headers of both files.") - #assert tbam.get_chromosomes() == cbam.get_chromosomes(), Exception("Treatment and Control BAM files have different orders of sorted chromosomes! Please check BAM Headers and re-sort BAM files.") + # assert tbam.get_chromosomes() == cbam.get_chromosomes(), Exception("Treatment and Control BAM files have different orders of sorted chromosomes! Please check BAM Headers and re-sort BAM files.") else: cbam = None - - ra_collections = [] + # ra_collections = [] # prepare and write header of output file (.vcf) ovcf = open(options.ofile, "w") - tmpcmdstr = " --fermi "+ fermi+ " --fermi-overlap "+str(fermiMinOverlap) - ovcf.write ( VCFHEADER % (datetime.date.today().strftime("%Y%m%d"), MACS_VERSION, " ".join(sys.argv[1:] + ["-Q", str(minQ), "-D", str(maxDuplicate), "--max-ar", str(max_allowed_ar), "--top2alleles-mratio", str(top2allelesminr), "--top2allele-count", str(min_altallele_count), "-g", str(min_heter_GQ), "-G", str(min_homo_GQ), tmpcmdstr]) ) + "\n" ) + tmpcmdstr = " --fermi " + fermi + " --fermi-overlap "+str(fermiMinOverlap) + ovcf.write(VCFHEADER % (datetime.date.today().strftime("%Y%m%d"), MACS_VERSION, " ".join(sys.argv[1:] + ["-Q", str(minQ), "-D", str(maxDuplicate), "--max-ar", str(max_allowed_ar), "--top2alleles-mratio", str(top2allelesminr), "--top2allele-count", str(min_altallele_count), "-g", str(min_heter_GQ), "-G", str(min_homo_GQ), tmpcmdstr])) + "\n") for (chrom, chrlength) in tbam.get_rlengths().items(): - ovcf.write( "##contig=\n" % ( chrom.decode(), chrlength ) ) - ovcf.write ( "\t".join( ("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","SAMPLE") ) + "\n" ) + ovcf.write("##contig=\n" % (chrom.decode(), chrlength)) + ovcf.write("\t".join(("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE")) + "\n") # to get time - t_total = 0 - t_prepare_ra = 0 - t_assemble = 
0 - #t_call_top2alleles = 0 - #t_call_lnL = 0 - t_call_variants = 0 - t_call_GT = 0 + # t_total = 0 + # t_prepare_ra = 0 + # t_assemble = 0 + # t_call_top2alleles = 0 + # t_call_lnL = 0 + # t_call_variants = 0 + # t_call_GT = 0 #t_call_to_vcf = 0 - t_total_0 = time() + # t_total_0 = time() for chrom in sorted(tbam.get_chromosomes()): - peaks_chr = peaks.get_data_from_chrom( chrom ) + peaks_chr = peaks.get_data_from_chrom(chrom) for peak in peaks_chr: # note, when we extract reads from BAM within a peak # region, we assume BAM should be sorted and the BAM # should be generated from "samtools view -L" process. # print ( "---begin of peak---") - info ( f"Peak: {chrom.decode()} {peak['start']} {peak['end']}" ) + info(f"Peak: {chrom.decode()} {peak['start']} {peak['end']}") - flag_todo_lassembly = False + # flag_todo_lassembly = False - t0 = time() + # t0 = time() try: if cbam: - ra_collection = RACollection( chrom, peak, tbam.get_reads_in_region( chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate ), cbam.get_reads_in_region( chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate) ) + ra_collection = RACollection(chrom, peak, + tbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate), + cbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate)) else: - ra_collection = RACollection( chrom, peak, tbam.get_reads_in_region( chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate ) ) - except: - info ("No reads found in this peak. Skipped" ) + ra_collection = RACollection(chrom, peak, + tbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate)) + except Exception: + info("No reads found in this peak. Skipped") # while there is no reads in peak region, simply skip it. 
continue - - ra_collection.remove_outliers( percent = 5 ) - t_prepare_ra += time() - t0 + + ra_collection.remove_outliers(percent=5) + # t_prepare_ra += time() - t0 # print ( "Reads in Peak:") # print ( ra_collection.get_FASTQ().decode() ) s = ra_collection["peak_refseq"] - peak_variants = PeakVariants( chrom.decode(), peak["start"], peak["end"], s ) - + peak_variants = PeakVariants(chrom.decode(), peak["start"], peak["end"], s) if fermi == "auto" or fermi == "off": # first pass to call variant w/o assembly # multiprocessing the following part - t_call_variants_0 = time() - info ( " Call variants w/o assembly") + # t_call_variants_0 = time() + info(" Call variants w/o assembly") # -- now make multi processes # divide right-left into NP parts - window_size = ceil( ( ra_collection["right"] - ra_collection["left"] ) / NP ) + window_size = ceil((ra_collection["right"] - ra_collection["left"]) / NP) - P = mp.Pool( NP ) + P = mp.Pool(NP) # this partial function will only be used in multiprocessing - p_call_variants_at_range = partial(call_variants_at_range, s=s, collection=ra_collection, top2allelesminr=top2allelesminr, max_allowed_ar = max_allowed_ar, min_altallele_count = min_altallele_count, min_homo_GQ = min_homo_GQ, min_heter_GQ = min_heter_GQ, minQ=minQ) + p_call_variants_at_range = partial(call_variants_at_range, s=s, collection=ra_collection, top2allelesminr=top2allelesminr, max_allowed_ar=max_allowed_ar, min_altallele_count=min_altallele_count, min_homo_GQ=min_homo_GQ, min_heter_GQ=min_heter_GQ, minQ=minQ) ranges = [] - for i in range( NP ): - l = i * window_size + ra_collection["left"] - r = min( (i + 1) * window_size + ra_collection["left"], ra_collection["right"] ) - ranges.append( (l, r) ) + for i in range(NP): + l_d = i * window_size + ra_collection["left"] + r_d = min((i + 1) * window_size + ra_collection["left"], ra_collection["right"]) + ranges.append((l_d, r_d)) - mapresults = P.map_async( p_call_variants_at_range, ranges ) + mapresults = 
P.map_async(p_call_variants_at_range, ranges) P.close() P.join() results = mapresults.get(timeout=window_size*300) - for i in range( NP ): - for result in results[ i ]: - peak_variants.add_variant( result[0], result[1] ) + for i in range(NP): + for result in results[i]: + peak_variants.add_variant(result[0], result[1]) - t_call_variants += time() - t_call_variants_0 + # t_call_variants += time() - t_call_variants_0 # Next, check if we should do local assembly - if ( fermi == "auto" and ( peak_variants.has_indel() or peak_variants.has_refer_biased_01() ) ) or fermi == "on": + if (fermi == "auto" and (peak_variants.has_indel() or peak_variants.has_refer_biased_01())) or fermi == "on": #print( peak_variants.has_indel() ) #print( peak_variants.has_refer_biased_01() ) - + # invoke fermi to assemble local sequence and filter out those can not be mapped to unitigs. - info ( " Try to call variants w/ fermi-lite assembly") - unitig_collection = ra_collection.build_unitig_collection( fermiMinOverlap ) + info(" Try to call variants w/ fermi-lite assembly") + unitig_collection = ra_collection.build_unitig_collection(fermiMinOverlap) if unitig_collection == -1: info(" Too many mismatches found while assembling the sequence, we will skip this region entirely!") continue elif unitig_collection == 0: - info ( " Failed to assemble unitigs, fall back to previous results" ) + info(" Failed to assemble unitigs, fall back to previous results") if peak_variants.n_variants() > 0: peak_variants.fix_indels() - ovcf.write( peak_variants.toVCF() ) + ovcf.write(peak_variants.toVCF()) continue # uncomment the following to print those assembled unitigs and their alignments to reference genome - #for u in unitig_collection["URAs_list"]: - # print( u["seq"].decode(), u["lpos"], u["rpos"], u["count"] ) - # print( "a",u["unitig_aln"].decode() ) - # print( "r",u["reference_aln"].decode() ) + # for u in unitig_collection["URAs_list"]: + # print(u["seq"].decode(), u["lpos"], u["rpos"], u["count"]) + # 
print("a",u["unitig_aln"].decode()) + # print("r",u["reference_aln"].decode()) else: # if we do not assemble, write results now if peak_variants.n_variants() > 0: - peak_variants.fix_indels() - ovcf.write( peak_variants.toVCF() ) + peak_variants.fix_indels() + ovcf.write(peak_variants.toVCF()) continue # reach here only if we need l assembly and the assembly returns result @@ -274,79 +276,76 @@ def run( args ): # revisit all refer_biased_01 now. We do not use # multiprocessing here for simplicity since there won't be # too many in a peak region. - if ( fermi == "auto" and ( not peak_variants.has_indel() ) and peak_variants.has_refer_biased_01() ): + if (fermi == "auto" and (not peak_variants.has_indel()) and peak_variants.has_refer_biased_01()): pos_tobe_revisit = peak_variants.get_refer_biased_01s() for i in pos_tobe_revisit: - ref_nt = chr(s[ i-ra_collection["left"] ] ).encode() + ref_nt = chr(s[i-ra_collection["left"]]).encode() if ref_nt == b'N': - peak_variants.remove_variant( i ) + peak_variants.remove_variant(i) continue - PRI = unitig_collection.get_PosReadsInfo_ref_pos ( i, ref_nt, Q=minQ ) - if PRI.raw_read_depth( opt="T" ) == 0: # skip if coverage is 0 - peak_variants.remove_variant( i ) + PRI = unitig_collection.get_PosReadsInfo_ref_pos(i, ref_nt, Q=minQ) + if PRI.raw_read_depth(opt="T") == 0: # skip if coverage is 0 + peak_variants.remove_variant(i) continue - PRI.update_top_alleles( top2allelesminr, min_altallele_count, max_allowed_ar ) - PRI.call_GT( max_allowed_ar ) + PRI.update_top_alleles(top2allelesminr, min_altallele_count, max_allowed_ar) + PRI.call_GT(max_allowed_ar) PRI.apply_GQ_cutoff(min_homo_GQ, min_heter_GQ) if not PRI.filterflag(): - peak_variants.replace_variant( i, PRI.toVariant() ) + peak_variants.replace_variant(i, PRI.toVariant()) else: - peak_variants.remove_variant( i ) + peak_variants.remove_variant(i) if peak_variants.n_variants() > 0: peak_variants.fix_indels() - ovcf.write( peak_variants.toVCF() ) + 
ovcf.write(peak_variants.toVCF()) continue # in this case, we call variants at every locations in the peak based on local assembly. - if ( fermi == "on" or ( fermi == "auto" and peak_variants.has_indel() ) ): - peak_variants = PeakVariants( chrom.decode(), peak["start"], peak["end"], s ) #reset + if (fermi == "on" or (fermi == "auto" and peak_variants.has_indel())): + peak_variants = PeakVariants(chrom.decode(), peak["start"], peak["end"], s) # reset # --- make multi processes # divide right-left into NP parts - window_size = ceil( ( ra_collection["right"] - ra_collection["left"] ) / NP ) - P = mp.Pool( NP ) + window_size = ceil((ra_collection["right"] - ra_collection["left"]) / NP) + P = mp.Pool(NP) # this partial function will only be used in multiprocessing - p_call_variants_at_range = partial(call_variants_at_range, s=s, collection=unitig_collection, top2allelesminr=top2allelesminr, max_allowed_ar = max_allowed_ar, min_altallele_count = min_altallele_count, min_homo_GQ = min_homo_GQ, min_heter_GQ = min_heter_GQ, minQ=minQ) + p_call_variants_at_range = partial(call_variants_at_range, s=s, collection=unitig_collection, top2allelesminr=top2allelesminr, max_allowed_ar=max_allowed_ar, min_altallele_count=min_altallele_count, min_homo_GQ=min_homo_GQ, min_heter_GQ=min_heter_GQ, minQ=minQ) ranges = [] - for i in range( NP ): - l = i * window_size + ra_collection["left"] - r = min( (i + 1) * window_size + ra_collection["left"], ra_collection["right"] ) - ranges.append( (l, r) ) + for i in range(NP): + l_d = i * window_size + ra_collection["left"] + r_d = min((i + 1) * window_size + ra_collection["left"], ra_collection["right"]) + ranges.append((l_d, r_d)) - mapresults = P.map_async( p_call_variants_at_range, ranges ) + mapresults = P.map_async(p_call_variants_at_range, ranges) P.close() P.join() - results = mapresults.get(timeout=window_size*300) - for i in range( NP ): - for result in results[ i ]: - peak_variants.add_variant( result[0], result[1] ) - + results = 
mapresults.get(timeout=window_size*300) + for i in range(NP): + for result in results[i]: + peak_variants.add_variant(result[0], result[1]) if peak_variants.n_variants() > 0: peak_variants.fix_indels() - ovcf.write( peak_variants.toVCF() ) + ovcf.write(peak_variants.toVCF()) - #print ("time to retrieve read alignment information from BAM:",t_prepare_ra,"(",round( 100 * t_prepare_ra/t_total, 2),"% )") + # print ("time to retrieve read alignment information from BAM:",t_prepare_ra,"(",round( 100 * t_prepare_ra/t_total, 2),"% )") return -def call_variants_at_range ( lr, s, collection, top2allelesminr, max_allowed_ar, min_altallele_count, min_homo_GQ, min_heter_GQ, minQ ): -#def call_variants_at_range ( lr, chrom, s, collection, top2allelesminr, max_allowed_ar, min_homo_GQ, min_heter_GQ ): + +def call_variants_at_range(lr, s, collection, top2allelesminr, max_allowed_ar, min_altallele_count, min_homo_GQ, min_heter_GQ, minQ): result = [] - for i in range( lr[ 0 ], lr[ 1 ] ): - ref_nt = chr(s[ i-collection["left"] ] ).encode() + for i in range(lr[0], lr[1]): + ref_nt = chr(s[i-collection["left"]]).encode() if ref_nt == b'N': continue - PRI = collection.get_PosReadsInfo_ref_pos ( i, ref_nt, Q=minQ ) - if PRI.raw_read_depth( opt="T" ) == 0: # skip if coverage is 0 + PRI = collection.get_PosReadsInfo_ref_pos(i, ref_nt, Q=minQ) + if PRI.raw_read_depth(opt="T") == 0: # skip if coverage is 0 continue - PRI.update_top_alleles( top2allelesminr, min_altallele_count, max_allowed_ar ) + PRI.update_top_alleles(top2allelesminr, min_altallele_count, max_allowed_ar) if not PRI.filterflag(): #PRI.update_top_alleles( top2allelesminr ) - PRI.call_GT( max_allowed_ar ) + PRI.call_GT(max_allowed_ar) PRI.apply_GQ_cutoff(min_homo_GQ, min_heter_GQ) if not PRI.filterflag(): - result.append( ( i, PRI.toVariant() ) ) + result.append((i, PRI.toVariant())) return result - - diff --git a/MACS3/Commands/cmbreps_cmd.py b/MACS3/Commands/cmbreps_cmd.py index d50a7344..c7cf9fc2 100644 --- 
a/MACS3/Commands/cmbreps_cmd.py +++ b/MACS3/Commands/cmbreps_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-05-15 11:16:04 Tao Liu> +# Time-stamp: <2024-10-02 16:42:02 Tao Liu> """Description: combine replicates @@ -7,9 +7,7 @@ the distribution). """ -import sys import os -from math import log as mlog from MACS3.IO import BedGraphIO from MACS3.Utilities.OptValidator import opt_validate_cmbreps @@ -26,28 +24,28 @@ # Main function # ------------------------------------ -def run( options ): - options = opt_validate_cmbreps( options ) + +def run(options): + options = opt_validate_cmbreps(options) info = options.info - warn = options.warn - debug = options.debug - error = options.error - + # warn = options.warn + # debug = options.debug + # error = options.error + info("Read and build bedGraph for each replicate...") reps = [] i = 1 for ifile in options.ifile: info("Read file #%d" % i) - reps.append( BedGraphIO.bedGraphIO( ifile ).read_bedGraph() ) + reps.append(BedGraphIO.bedGraphIO(ifile).read_bedGraph()) i += 1 # first two reps info("combining tracks 1-%i with method '%s'" % (i - 1, options.method)) - cmbtrack = reps[ 0 ].overlie( [reps[ j ] for j in range(1, i - 1)], func=options.method ) + cmbtrack = reps[0].overlie([reps[j] for j in range(1, i - 1)], func=options.method) # now output - ofile = BedGraphIO.bedGraphIO( os.path.join( options.outdir, options.ofile ), data = cmbtrack ) + ofile = BedGraphIO.bedGraphIO(os.path.join(options.outdir, options.ofile), data=cmbtrack) info("Write bedGraph of combined scores...") - ofile.write_bedGraph(name="%s_combined_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper())) + ofile.write_bedGraph(name="%s_combined_scores" % (options.method.upper()), description="Scores calculated by %s" % (options.method.upper())) info("Finished '%s'! Please check '%s'!" 
% (options.method, ofile.bedGraph_filename)) - diff --git a/MACS3/Commands/diffpeak_cmd.py b/MACS3/Commands/diffpeak_cmd.py deleted file mode 100644 index 7273697b..00000000 --- a/MACS3/Commands/diffpeak_cmd.py +++ /dev/null @@ -1,254 +0,0 @@ -# Time-stamp: <2020-11-24 16:48:57 Tao Liu> - -"""Description: obsolete function - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file LICENSE included with -the distribution). -""" - -# ------------------------------------ -# python modules -# ------------------------------------ - -import os -import sys -from time import strftime - -# ------------------------------------ -# own python modules -# ------------------------------------ -# from MACS3.IO import cBedGraphIO -# from MACS3.IO.cDiffScore import DiffScoreTrackI -# from MACS3.IO.cPeakIO import PeakIO -# from MACS3.OptValidator import opt_validate_diffpeak -# from MACS3.Prob import binomial_cdf_inv -# from MACS3.PeakModel import PeakModel,NotEnoughPairsException -# from MACS3.PeakDetect import PeakDetect -# from MACS3.Constants import * -# ------------------------------------ -# constants -# ------------------------------------ - -# ------------------------------------ -# Misc functions -# ------------------------------------ -import logging -import MACS3.Utilities.Logger - -logger = logging.getLogger(__name__) -debug = logger.debug -info = logger.info -# ------------------------------------ -# Main function -# ------------------------------------ -def run( args ): - """The Differential function/pipeline for MACS. - - """ - return -# # Parse options... 
-# options = opt_validate_diffpeak( args ) -# #0 output arguments -# # info("\n"+options.argtxt) - -# ofile_prefix = options.name - -# # check if tag files exist -# with open(options.t1bdg) as f: pass -# with open(options.c1bdg) as f: pass -# with open(options.t2bdg) as f: pass -# with open(options.c2bdg) as f: pass - -# if not options.peaks1 == '': -# info("Read peaks for condition 1...") -# p1io = PeakIO() -# with open(options.peaks1, 'rU') as f: -# p1io.read_from_xls(f) - -# if not options.peaks2 == '': -# info("Read peaks for condition 2...") -# p2io = PeakIO() -# with open(options.peaks2, 'rU') as f: -# p2io.read_from_xls(f) - -# #1 Read tag files -# info("Read and build treatment 1 bedGraph...") -# t1bio = cBedGraphIO.bedGraphIO(options.t1bdg) -# t1btrack = t1bio.build_bdgtrack() - -# info("Read and build control 1 bedGraph...") -# c1bio = cBedGraphIO.bedGraphIO(options.c1bdg) -# c1btrack = c1bio.build_bdgtrack() - -# if len(options.depth) >=2: -# depth1 = options.depth[0] -# depth2 = options.depth[1] -# else: -# depth1 = options.depth[0] -# depth2 = depth1 - -# info("Read and build treatment 2 bedGraph...") -# t2bio = cBedGraphIO.bedGraphIO(options.t2bdg) -# t2btrack = t2bio.build_bdgtrack() - -# info("Read and build control 2 bedGraph...") -# c2bio = cBedGraphIO.bedGraphIO(options.c2bdg) -# c2btrack = c2bio.build_bdgtrack() - -# #3 Call Peaks - -# diffscore = DiffScoreTrackI( t1btrack, -# c1btrack, -# t2btrack, -# c2btrack, -# depth1, depth2 ) -# diffscore.finalize() -# if options.call_peaks: -# diffscore.set_track_score_method(options.track_score_method) -# info("Calling peaks") -# if options.track_score_method == 'p': -# diffscore.call_peaks(cutoff = options.peaks_log_pvalue, -# min_length = options.pminlen) -# elif options.track_score_method == 'q': -# diffscore.call_peaks(cutoff = options.peaks_log_qvalue, -# min_length = options.pminlen) -# else: -# raise NotImplementedError -# else: -# info("Using existing peaks") -# diffscore.store_peaks(p1io, p2io) 
-# info("Rebuilding chromosomes") -# diffscore.rebuild_chromosomes() -# diffscore.annotate_peaks() - -# info("Calling differentially occupied peaks") -# if options.score_method == 'p': -# diffscore.call_diff_peaks(cutoff = options.log_pvalue, -# min_length = options.dminlen, -# score_method = options.score_method) -# if options.score_method == 'q': -# diffscore.call_diff_peaks(cutoff = options.log_qvalue, -# min_length = options.dminlen, -# score_method = options.score_method) -# # diffscore.print_some_peaks() -# # diffscore.print_diff_peaks() - -# info("Write output xls and BED files...") -# ofhd_xls = open( os.path.join( options.outdir, options.peakxls), "w" ) -# ofhd_xls.write("# This file is generated by MACS version, using the diffpeak module %s\n" % (MACS_VERSION)) -# ofhd_xls.write( options.argtxt+"\n" ) -# ofhd_bed = open( os.path.join( options.outdir, options.peakbed), "w" ) - -# # pass write method so we can print too, and include name -# diffscore.write_peaks(xls=ofhd_xls, bed=ofhd_bed, -# name = options.name, name_prefix="%s_peak_", -# description="Peaks for %s (Made with MACS v2, " + strftime("%x") + ")", -# trackline=options.trackline) -# ofhd_xls.close() -# ofhd_bed.close() - -# if diffscore.has_peakio(): -# info("Write annotated peak xls files...") -# ofhd_xls1 = open( os.path.join( options.outdir, options.peak1xls), "w" ) -# ofhd_xls1.write("# This file is generated by MACS version, using the diffpeak module %s\n" % (MACS_VERSION)) -# ofhd_xls1.write(options.argtxt+"\n") -# ofhd_xls2 = open( os.path.join( options.outdir, options.peak2xls), "w" ) -# ofhd_xls2.write("# This file is generated by MACS version, using the diffpeak module %s\n" % (MACS_VERSION)) -# ofhd_xls2.write(options.argtxt+"\n") -# diffscore.write_peaks_by_summit(ofhd_xls1, ofhd_xls2, -# name = options.name, name_prefix="%s_peak_") -# ofhd_xls1.close() -# ofhd_xls2.close() - -# if options.store_bdg: -# info("#4 Write output bedgraph files...") -# ofhd_logLR = open( os.path.join( 
options.outdir, options.bdglogLR), "w" ) -# ofhd_pvalue = open( os.path.join( options.outdir, options.bdgpvalue), "w" ) -# ofhd_logFC = open( os.path.join( options.outdir, options.bdglogFC), "w" ) -# diffscore.write_bedgraphs(logLR=ofhd_logLR, pvalue=ofhd_pvalue, -# logFC=ofhd_logFC, name = options.name, -# description=" for %s (Made with MACS v2, " + strftime("%x") + ")", -# trackline=options.trackline) -# ofhd_logLR.close() -# ofhd_pvalue.close() -# ofhd_logFC.close() - - -# def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): -# """Calculate the maximum duplicated tag number based on genome -# size, total tag number and a p-value based on binomial -# distribution. Brute force algorithm to calculate reverse CDF no -# more than MAX_LAMBDA(100000). - -# """ -# return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) - -# def load_frag_files_options ( options ): -# """From the options, load treatment fragments and control fragments (if available). - -# """ -# options.info("#1 read treatment fragments...") - -# tp = options.parser(options.tfile[0]) -# treat = tp.build_petrack() -# treat.sort() -# if len(options.tfile) > 1: -# # multiple input -# for tfile in options.tfile[1:]: -# tp = options.parser(tfile) -# treat = tp.append_petrack( treat ) -# treat.sort() - -# options.tsize = tp.d -# if options.cfile: -# options.info("#1.2 read input fragments...") -# cp = options.parser(options.cfile[0]) -# control = cp.build_petrack() -# control_d = cp.d -# control.sort() -# if len(options.cfile) > 1: -# # multiple input -# for cfile in options.cfile[1:]: -# cp = options.parser(cfile) -# control = cp.append_petrack( control ) -# control.sort() -# else: -# control = None -# options.info("#1 mean fragment size is determined as %d bp from treatment" % options.tsize) -# if control is not None: -# options.info("#1 note: mean fragment size in control is %d bp -- value ignored" % control_d) -# return (treat, control) - -# def load_tag_files_options ( options ): -# """From the 
options, load treatment tags and control tags (if available). - -# """ -# options.info("#1 read treatment tags...") -# tp = options.parser(options.tfile[0]) -# if not options.tsize: # override tsize if user specified --tsize -# ttsize = tp.tsize() -# options.tsize = ttsize -# treat = tp.build_fwtrack() -# treat.sort() -# if len(options.tfile) > 1: -# # multiple input -# for tfile in options.tfile[1:]: -# tp = options.parser(tfile) -# treat = tp.append_fwtrack( treat ) -# treat.sort() - -# if options.cfile: -# options.info("#1.2 read input tags...") -# control = options.parser(options.cfile[0]).build_fwtrack() -# control.sort() -# if len(options.cfile) > 1: -# # multiple input -# for cfile in options.cfile[1:]: -# cp = options.parser(cfile) -# control = cp.append_fwtrack( control ) -# control.sort() -# else: -# control = None -# options.info("#1 tag size is determined as %d bps" % options.tsize) -# return (treat, control) diff --git a/MACS3/Commands/filterdup_cmd.py b/MACS3/Commands/filterdup_cmd.py index 156909f0..84fc1010 100644 --- a/MACS3/Commands/filterdup_cmd.py +++ b/MACS3/Commands/filterdup_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2020-11-24 16:49:34 Tao Liu> +# Time-stamp: <2024-10-02 16:48:17 Tao Liu> """Description: Filter duplicate reads depending on sequencing depth. @@ -17,40 +17,42 @@ # ------------------------------------ # own python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +# from MACS3.Utilities.Constants import * from MACS3.Utilities.OptValidator import opt_validate_filterdup from MACS3.Signal.Prob import binomial_cdf_inv # ------------------------------------ # Main function # ------------------------------------ -def run( o_options ): + + +def run(o_options): """The Main function/pipeline for duplication filter. """ # Parse options... 
- options = opt_validate_filterdup( o_options ) + options = opt_validate_filterdup(o_options) # end of parsing commandline options info = options.info - warn = options.warn - debug = options.debug - error = options.error + # warn = options.warn + # debug = options.debug + # error = options.error - options.PE_MODE = options.format in ('BAMPE','BEDPE') + options.PE_MODE = options.format in ('BAMPE', 'BEDPE') if options.outputfile != "stdout": - outfhd = open( os.path.join( options.outdir, options.outputfile ) ,"w" ) + outfhd = open(os.path.join(options.outdir, options.outputfile), "w") else: outfhd = sys.stdout - #1 Read tag files + # 1 Read tag files if options.PE_MODE: info("# read input file in Paired-end mode.") - inputtrack = load_frag_files_options ( options ) # return PETrackI object - t0 = inputtrack.total # total fragments - info("# total fragments/pairs in alignment file: %d" % (t0) ) + inputtrack = load_frag_files_options(options) # return PETrackI object + t0 = inputtrack.total # total fragments + info("# total fragments/pairs in alignment file: %d" % (t0)) else: info("# read tag files...") - inputtrack = load_tag_files_options (options) + inputtrack = load_tag_files_options(options) info("# tag size = %d" % options.tsize) inputtrack.fw = options.tsize @@ -69,29 +71,30 @@ def run( o_options ): max_dup_tags = int(options.keepduplicates) info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags)) - inputtrack.filter_dup( max_dup_tags ) + inputtrack.filter_dup(max_dup_tags) t1 = inputtrack.total info(" tags after filtering in alignment file: %d" % (t1)) info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0)) if not options.dryrun: - info( "Write to BED file" ) - inputtrack.print_to_bed( fhd=outfhd ) - info( "finished! Check %s." % options.outputfile ) + info("Write to BED file") + inputtrack.print_to_bed(fhd=outfhd) + info("finished! Check %s." 
% options.outputfile) else: - info( "Dry-run is finished!" ) + info("Dry-run is finished!") -def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ): +def cal_max_dup_tags(genome_size, tags_number, p=1e-5): """Calculate the maximum duplicated tag number based on genome size, total tag number and a p-value based on binomial distribution. Brute force algorithm to calculate reverse CDF no more than MAX_LAMBDA(100000). """ - return binomial_cdf_inv(1-p,tags_number,1.0/genome_size) + return binomial_cdf_inv(1-p, tags_number, 1.0/genome_size) + -def load_tag_files_options ( options ): +def load_tag_files_options(options): """From the options, load alignment tags. """ @@ -106,12 +109,13 @@ def load_tag_files_options ( options ): # multiple input for tfile in options.ifile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) - treat = tp.append_fwtrack( treat ) - #treat.sort() + treat = tp.append_fwtrack(treat) + # treat.sort() treat.finalize() return treat -def load_frag_files_options ( options ): + +def load_frag_files_options(options): """From the options, load treatment fragments and control fragments (if available). 
""" @@ -122,7 +126,7 @@ def load_frag_files_options ( options ): if len(options.ifile) > 1: # multiple input for tfile in options.ifile[1:]: - tp = options.parser(ifile, buffer_size=options.buffer_size) - treat = tp.append_petrack( treat ) + tp = options.parser(tfile, buffer_size=options.buffer_size) + treat = tp.append_petrack(treat) treat.finalize() return treat diff --git a/MACS3/Commands/hmmratac_cmd.py b/MACS3/Commands/hmmratac_cmd.py index 9669b62a..b2345cef 100644 --- a/MACS3/Commands/hmmratac_cmd.py +++ b/MACS3/Commands/hmmratac_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2024-05-15 19:40:45 Tao Liu> +# Time-stamp: <2024-10-02 17:41:10 Tao Liu> """Description: Main HMMR command @@ -16,28 +16,26 @@ import sys import gc import numpy as np -import json -import csv import tempfile -from hmmlearn import hmm -#from typing import Sized # ------------------------------------ # own python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +# from MACS3.Utilities.Constants import * from MACS3.Utilities.OptValidator import opt_validate_hmmratac from MACS3.IO.PeakIO import PeakIO -from MACS3.IO.Parser import BAMPEParser, BEDPEParser #BAMaccessor +from MACS3.IO.Parser import BAMPEParser, BEDPEParser # BAMaccessor from MACS3.Signal.HMMR_EM import HMMR_EM -from MACS3.Signal.HMMR_Signal_Processing import generate_weight_mapping, generate_digested_signals, extract_signals_from_regions -from MACS3.Signal.HMMR_HMM import hmm_training, hmm_predict, hmm_model_init, hmm_model_save +from MACS3.Signal.HMMR_Signal_Processing import (generate_weight_mapping, + generate_digested_signals, + extract_signals_from_regions) +from MACS3.Signal.HMMR_HMM import (hmm_training, + hmm_predict, + hmm_model_init, + hmm_model_save) from MACS3.Signal.Region import Regions -from MACS3.Signal.BedGraph import bedGraphTrackI from MACS3.IO.BedGraphIO import bedGraphIO -#from MACS3.IO.BED import BEDreader # this hasn't been implemented yet. 
- # ------------------------------------ # constants # ------------------------------------ @@ -49,36 +47,37 @@ # ------------------------------------ # Main function # ------------------------------------ -def run( args ): + + +def run(args): """The HMMRATAC function/pipeline for MACS. """ - options = opt_validate_hmmratac( args ) + options = opt_validate_hmmratac(args) ############################################# # 0. Names of output files ############################################# - short_bdgfile = os.path.join( options.outdir, options.name+"_digested_short.bdg" ) - mono_bdgfile = os.path.join( options.outdir, options.name+"_digested_mono.bdg" ) - di_bdgfile = os.path.join( options.outdir, options.name+"_digested_di.bdg" ) - tri_bdgfile = os.path.join( options.outdir, options.name+"_digested_tri.bdg" ) - - training_region_bedfile = os.path.join( options.outdir, options.name+"_training_regions.bed" ) - training_datafile = os.path.join( options.outdir, options.name+"_training_data.txt" ) - training_datalengthfile = os.path.join( options.outdir, options.name+"_training_lengths.txt" ) + short_bdgfile = os.path.join(options.outdir, options.name+"_digested_short.bdg") + mono_bdgfile = os.path.join(options.outdir, options.name+"_digested_mono.bdg") + di_bdgfile = os.path.join(options.outdir, options.name+"_digested_di.bdg") + tri_bdgfile = os.path.join(options.outdir, options.name+"_digested_tri.bdg") + training_region_bedfile = os.path.join(options.outdir, options.name+"_training_regions.bed") + training_datafile = os.path.join(options.outdir, options.name+"_training_data.txt") + training_datalengthfile = os.path.join(options.outdir, options.name+"_training_lengths.txt") - hmm_modelfile = os.path.join( options.outdir, options.name+"_model.json" ) + hmm_modelfile = os.path.join(options.outdir, options.name+"_model.json") - open_state_bdgfile = os.path.join( options.outdir, options.name+"_open.bdg" ) - nuc_state_bdgfile = os.path.join( options.outdir, 
options.name+"_nuc.bdg" ) - bg_state_bdgfile = os.path.join( options.outdir, options.name+"_bg.bdg" ) + open_state_bdgfile = os.path.join(options.outdir, options.name+"_open.bdg") + nuc_state_bdgfile = os.path.join(options.outdir, options.name+"_nuc.bdg") + bg_state_bdgfile = os.path.join(options.outdir, options.name+"_bg.bdg") - states_file = os.path.join( options.outdir, options.name+"_states.bed" ) + states_file = os.path.join(options.outdir, options.name+"_states.bed") - accessible_file = os.path.join( options.outdir, options.name+"_accessible_regions.narrowPeak" ) + accessible_file = os.path.join(options.outdir, options.name+"_accessible_regions.narrowPeak") + + cutoffanalysis_file = os.path.join(options.outdir, options.name+"_cutoff_analysis.tsv") - cutoffanalysis_file = os.path.join( options.outdir, options.name+"_cutoff_analysis.tsv" ) - ############################################# # 1. Read the input files ############################################# @@ -95,31 +94,31 @@ def run( args ): alignment = parser(options.input_file[0], buffer_size=options.buffer_size) petrack = alignment.build_petrack() - if len( options.input_file ) > 1: + if len(options.input_file) > 1: # multiple input for inputfile in options.input_file[1:]: alignment = parser(inputfile, buffer_size=options.buffer_size) - petrack = alignment.append_petrack( petrack ) + petrack = alignment.append_petrack(petrack) # remember to finalize the petrack petrack.finalize() # filter duplicates if needed if options.misc_keep_duplicates: - petrack.filter_dup( maxnum=1 ) + petrack.filter_dup(maxnum=1) - # read in blacklisted if option entered + # read in blacklisted if option entered if options.blacklist: options.info("# Read blacklist file...") - peakio = open( options.blacklist ) + peakio = open(options.blacklist) blacklist = PeakIO() i = 0 - for l in peakio: - fs = l.rstrip().split() + for l_p in peakio: + fs = l_p.rstrip().split() i += 1 - blacklist.add( fs[0].encode(), int(fs[1]), int(fs[2]), 
name=b"%d" % i ) + blacklist.add(fs[0].encode(), int(fs[1]), int(fs[2]), name=b"%d" % i) blacklist.sort() blacklist_regions = Regions() - blacklist_regions.init_from_PeakIO( blacklist ) + blacklist_regions.init_from_PeakIO(blacklist) ############################################# # 2. EM @@ -128,12 +127,12 @@ def run( args ): # Skip EM and use the options.em_means and options.em_stddevs em_means = options.em_means em_stddevs = options.em_stddevs - options.info( "#2 EM is skipped. The following means and stddevs will be used:" ) + options.info("#2 EM is skipped. The following means and stddevs will be used:") else: # we will use EM to get the best means/stddevs for the mono-, di- and tri- modes of fragment sizes options.info("#2 Use EM algorithm to estimate means and stddevs of fragment lengths") options.info("# for mono-, di-, and tri-nucleosomal signals...") - em_trainer = HMMR_EM( petrack, options.em_means[1:4], options.em_stddevs[1:4], seed = options.hmm_randomSeed ) + em_trainer = HMMR_EM(petrack, options.em_means[1:4], options.em_stddevs[1:4], seed=options.hmm_randomSeed) # the mean and stddev after EM training em_means = [options.em_means[0],] em_means.extend(em_trainer.fragMeans) @@ -141,65 +140,65 @@ def run( args ): em_stddevs.extend(em_trainer.fragStddevs) # we will round to 1 decimal digit for i in range(len(em_means)): - em_means[ i ] = round(em_means[ i ], 1) + em_means[i] = round(em_means[i], 1) for i in range(len(em_stddevs)): - em_stddevs[ i ] = round(em_stddevs[ i ], 1) - options.info( f"# The means and stddevs after EM:") + em_stddevs[i] = round(em_stddevs[i], 1) + options.info(f"# The means and stddevs after EM:") - options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format( ["short", "mono", "di", "tri"] ) ) - options.info( "# means: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format( em_means ) ) - options.info( "# stddevs: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format( em_stddevs ) ) + options.info( 
"# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"])) + options.info( "# means: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_means)) + options.info( "# stddevs: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_stddevs)) # to finalize the EM training, we will decompose ATAC-seq into four signal tracks - options.info( f"# Compute the weights for each fragment length for each of the four signal types") + options.info(f"# Compute the weights for each fragment length for each of the four signal types") fl_dict = petrack.count_fraglengths() fl_list = list(fl_dict.keys()) fl_list.sort() # now we will prepare the weights for each fragment length for # each of the four distributions based on the EM results - weight_mapping = generate_weight_mapping( fl_list, em_means, em_stddevs, min_frag_p = options.min_frag_p ) - - options.info( f"# Generate short, mono-, di-, and tri-nucleosomal signals") - digested_atac_signals = generate_digested_signals( petrack, weight_mapping ) + weight_mapping = generate_weight_mapping(fl_list, em_means, em_stddevs, min_frag_p=options.min_frag_p) + + options.info("# Generate short, mono-, di-, and tri-nucleosomal signals") + digested_atac_signals = generate_digested_signals(petrack, weight_mapping) # save three types of signals if needed if options.save_digested: - bdgshort = BedGraphIO( short_bdgfile, data = digested_atac_signals[ 0 ] ) + bdgshort = bedGraphIO(short_bdgfile, data=digested_atac_signals[0]) bdgshort.write_bedGraph("short","short") - bdgmono = BedGraphIO( mono_bdgfile, data = digested_atac_signals[ 1 ] ) + bdgmono = bedGraphIO(mono_bdgfile, data=digested_atac_signals[1]) bdgmono.write_bedGraph("mono", "mono") - bdgdi = BedGraphIO( di_bdgfile, data = digested_atac_signals[ 2 ] ) + bdgdi = bedGraphIO(di_bdgfile, data=digested_atac_signals[2]) bdgdi.write_bedGraph("di", "di") - bdgtri = BedGraphIO( tri_bdgfile, data = digested_atac_signals[ 3 ] ) + bdgtri = 
bedGraphIO(tri_bdgfile, data=digested_atac_signals[3]) bdgtri.write_bedGraph("tri", "tri") minlen = int(petrack.average_template_length) - # if options.pileup_short is on, we pile up only the short fragments to identify training + # if options.pileup_short is on, we pile up only the short fragments to identify training # regions and to prescan for candidate regions for decoding. if options.pileup_short: - options.info( f"# Pile up ONLY short fragments" ) - fc_bdg = digested_atac_signals[ 0 ] + options.info("# Pile up ONLY short fragments") + fc_bdg = digested_atac_signals[0] else: - options.info( f"# Pile up all fragments" ) - fc_bdg = petrack.pileup_bdg( [1.0,], baseline_value = 0 ) + options.info("# Pile up all fragments") + fc_bdg = petrack.pileup_bdg([1.0, ], baseline_value=0) (sum_v, n_v, max_v, min_v, mean_v, std_v) = fc_bdg.summary() - options.info( f"# Convert pileup to fold-change over average signal" ) + options.info("# Convert pileup to fold-change over average signal") fc_bdg.apply_func(lambda x: x/mean_v) # if cutoff_analysis only, generate and save the report and quit if options.cutoff_analysis_only: # we will run cutoff analysis only and quit - options.info( f"#3 Generate cutoff analysis report from {petrack.total} fragments") - options.info( f"# Please review the cutoff analysis result in {cutoffanalysis_file}" ) + options.info(f"#3 Generate cutoff analysis report from {petrack.total} fragments") + options.info(f"# Please review the cutoff analysis result in {cutoffanalysis_file}") # Let MACS3 do the cutoff analysis to help decide the lower and upper cutoffs with open(cutoffanalysis_file, "w") as ofhd_cutoff: - ofhd_cutoff.write( fc_bdg.cutoff_analysis( min_length=minlen, max_gap=options.hmm_training_flanking, max_score = options.cutoff_analysis_max, steps=options.cutoff_analysis_steps ) ) - #raise Exception("Cutoff analysis only.") + ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen, max_gap=options.hmm_training_flanking, 
max_score=options.cutoff_analysis_max, steps=options.cutoff_analysis_steps)) + # raise Exception("Cutoff analysis only.") sys.exit(1) ############################################# @@ -208,192 +207,192 @@ def run( args ): if options.hmm_file: # skip this step if hmm_file is given - options.info( f"#3 Skip this step of looking for training set since a Hidden Markov Model file has been provided!") - elif options.hmm_training_regions: + options.info("#3 Skip this step of looking for training set since a Hidden Markov Model file has been provided!") + elif options.hmm_training_regions: # if a training region file is provided - need to read in the bedfile and skip the peak calling step options.info(f"#3 Read training regions from BED file: {options.hmm_training_regions}") # from refinepeak_cmd.py: - peakio = open(options.hmm_training_regions,"rb") + peakio = open(options.hmm_training_regions, "rb") peaks = PeakIO() - for l in peakio: - fs = l.rstrip().split() - peaks.add( chromosome=fs[0], start=int(fs[1]), end=int(fs[2])) #change based on what expected input file should contain + for l_p in peakio: + fs = l_p.rstrip().split() + peaks.add(chromosome=fs[0], start=int(fs[1]), end=int(fs[2])) #change based on what expected input file should contain peakio.close() training_regions = Regions() - training_regions.init_from_PeakIO( peaks ) + training_regions.init_from_PeakIO(peaks) options.info("# Training regions have been read from bedfile") else: # Find regions with fold change within determined range to use as training sites. # Find regions with zscore values above certain cutoff to exclude from viterbi. - # - options.info( f"#3 Look for training set from {petrack.total} fragments" ) - options.info( f"# Call peak above within fold-change range of {options.hmm_lower} and {options.hmm_upper}." 
) - options.info( f"# The minimum length of the region is set as the average template/fragment length in the dataset: {minlen}" ) - options.info( f"# The maximum gap to merge nearby significant regions is set as the flanking size to extend training regions: {options.hmm_training_flanking}" ) - peaks = fc_bdg.call_peaks (cutoff=options.hmm_lower, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False) - options.info( f"# Total training regions called after applying the lower cutoff {options.hmm_lower}: {peaks.total}" ) - peaks.filter_score( options.hmm_lower, options.hmm_upper ) - options.info( f"# Total training regions after filtering with upper cutoff {options.hmm_upper}: {peaks.total}" ) - - options.info( f"# **IMPORTANT**") - options.info( f"# Please review the cutoff analysis result in {cutoffanalysis_file} to verify" ) - options.info( f"# if the choices of lower, upper and prescanning cutoff are appropriate." ) - options.info( f"# Please read the message in the section 'Choices of cutoff values' by running" ) - options.info( f"# `macs3 hmmratac -h` for detail." 
) - options.info( f"# ****" ) - + # + options.info(f"#3 Look for training set from {petrack.total} fragments") + options.info(f"# Call peak above within fold-change range of {options.hmm_lower} and {options.hmm_upper}.") + options.info(f"# The minimum length of the region is set as the average template/fragment length in the dataset: {minlen}") + options.info(f"# The maximum gap to merge nearby significant regions is set as the flanking size to extend training regions: {options.hmm_training_flanking}") + peaks = fc_bdg.call_peaks(cutoff=options.hmm_lower, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False) + options.info(f"# Total training regions called after applying the lower cutoff {options.hmm_lower}: {peaks.total}") + peaks.filter_score(options.hmm_lower, options.hmm_upper) + options.info(f"# Total training regions after filtering with upper cutoff {options.hmm_upper}: {peaks.total}") + + options.info( "# **IMPORTANT**") + options.info(f"# Please review the cutoff analysis result in {cutoffanalysis_file} to verify") + options.info( "# if the choices of lower, upper and prescanning cutoff are appropriate.") + options.info( "# Please read the message in the section 'Choices of cutoff values' by running") + options.info( "# `macs3 hmmratac -h` for detail.") + options.info( "# ****") + # Let MACS3 do the cutoff analysis to help decide the lower and upper cutoffs with open(cutoffanalysis_file, "w") as ofhd_cutoff: - ofhd_cutoff.write( fc_bdg.cutoff_analysis( min_length=minlen, max_gap=options.hmm_training_flanking, max_score = options.cutoff_analysis_max ) ) - + ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen, max_gap=options.hmm_training_flanking, max_score=options.cutoff_analysis_max)) + # we will check if anything left after filtering if peaks.total > options.hmm_maxTrain: - peaks = peaks.randomly_pick( options.hmm_maxTrain, seed = options.hmm_randomSeed ) - options.info( f"# We randomly pick {options.hmm_maxTrain} regions 
for training" ) + peaks = peaks.randomly_pick(options.hmm_maxTrain, seed=options.hmm_randomSeed) + options.info(f"# We randomly pick {options.hmm_maxTrain} regions for training") elif peaks.total == 0: - options.error( f"# No training regions found. Please adjust the lower or upper cutoff." ) + options.error("# No training regions found. Please adjust the lower or upper cutoff.") raise Exception("Not enough training regions!") - + # Now we convert PeakIO to Regions and filter blacklisted regions training_regions = Regions() - training_regions.init_from_PeakIO( peaks ) + training_regions.init_from_PeakIO(peaks) # We will expand the regions to both directions and merge overlap - options.info( f"# We expand the training regions with {options.hmm_training_flanking} basepairs and merge overlap" ) - training_regions.expand( options.hmm_training_flanking ) + options.info(f"# We expand the training regions with {options.hmm_training_flanking} basepairs and merge overlap") + training_regions.expand(options.hmm_training_flanking) training_regions.merge_overlap() - + # remove peaks overlapping with blacklisted regions if options.blacklist: - training_regions.exclude( blacklist_regions ) - options.info( f"# after removing those overlapping with provided blacklisted regions, we have {training_regions.total} left" ) + training_regions.exclude(blacklist_regions) + options.info(f"# after removing those overlapping with provided blacklisted regions, we have {training_regions.total} left") if options.save_train: - fhd = open( training_region_bedfile, "w" ) - training_regions.write_to_bed( fhd ) + fhd = open(training_region_bedfile, "w") + training_regions.write_to_bed(fhd) fhd.close() - options.info( f"# Training regions have been saved to `{options.name}_training_regions.bed` " ) - + options.info(f"# Training regions have been saved to `{options.name}_training_regions.bed` ") + ############################################# # 4. 
Train HMM ############################################# # if model file is provided, we skip this step # include options.hmm_type and make it backwards compatible, if no hmm_type default is gaussian if options.hmm_file: - options.info( f"#4 Load Hidden Markov Model from given model file") - hmm_model, i_open_region, i_background_region, i_nucleosomal_region, options.hmm_binsize, options.hmm_type = hmm_model_init( options.hmm_file ) + options.info("#4 Load Hidden Markov Model from given model file") + hmm_model, i_open_region, i_background_region, i_nucleosomal_region, options.hmm_binsize, options.hmm_type = hmm_model_init(options.hmm_file) else: - options.info( f"#4 Train Hidden Markov Model with Multivariate Gaussian Emission" ) + options.info("#4 Train Hidden Markov Model with Multivariate Gaussian Emission") # extract signals within peak using the given binsize - options.info( f"# Extract signals in training regions with bin size of {options.hmm_binsize}") - [ training_bins, training_data, training_data_lengths ] = extract_signals_from_regions( digested_atac_signals, training_regions, binsize = options.hmm_binsize, hmm_type = options.hmm_type ) + options.info(f"# Extract signals in training regions with bin size of {options.hmm_binsize}") + [training_bins, training_data, training_data_lengths] = extract_signals_from_regions(digested_atac_signals, training_regions, binsize=options.hmm_binsize, hmm_type=options.hmm_type) if options.save_train: - f = open( training_datafile, "w" ) - for i in range( len( training_data ) ): - v = training_data[ i ] - p = training_bins[ i ] - f.write( f"{p[0]}\t{p[1]}\t{v[0]}\t{v[1]}\t{v[2]}\t{v[3]}\n" ) + f = open(training_datafile, "w") + for i in range(len(training_data)): + v = training_data[i] + p = training_bins[i] + f.write(f"{p[0]}\t{p[1]}\t{v[0]}\t{v[1]}\t{v[2]}\t{v[3]}\n") f.close() - f = open( training_datalengthfile, "w" ) + f = open(training_datalengthfile, "w") for v in training_data_lengths: - f.write( f"{v}\n" ) + 
f.write(f"{v}\n") f.close() - options.info( f"# Use Baum-Welch algorithm to train the HMM") + options.info("# Use Baum-Welch algorithm to train the HMM") - hmm_model = hmm_training( training_data, training_data_lengths, random_seed = options.hmm_randomSeed, hmm_type = options.hmm_type ) + hmm_model = hmm_training(training_data, training_data_lengths, random_seed=options.hmm_randomSeed, hmm_type=options.hmm_type) - options.info( f"# HMM converged: {hmm_model.monitor_.converged}") + options.info(f"# HMM converged: {hmm_model.monitor_.converged}") # label hidden states if options.hmm_type == "gaussian": - means_sum = np.sum( hmm_model.means_, axis=1 ) + means_sum = np.sum(hmm_model.means_, axis=1) if options.hmm_type == "poisson": - means_sum = np.sum( hmm_model.lambdas_, axis=1 ) + means_sum = np.sum(hmm_model.lambdas_, axis=1) # first, the state with the highest overall emission is the open state - i_open_region = np.where( means_sum == max(means_sum) )[0][0] + i_open_region = np.where(means_sum == max(means_sum))[0][0] - # second, the state with lowest overall emission is the bg state - i_background_region = np.where( means_sum == min(means_sum) )[0][0] + # second, the state with lowest overall emission is the bg state + i_background_region = np.where(means_sum == min(means_sum))[0][0] # last one is the nuc state (note it may not be accurate though i_nucleosomal_region = list(set([0, 1, 2]) - set([i_open_region, i_background_region]))[0] # write hmm into model file - options.info( f"# Write HMM parameters into JSON: {hmm_modelfile}") - hmm_model_save( hmm_modelfile, hmm_model, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region, options.hmm_type ) - - # if --modelonly option provided, exit script after hmm model is saved + options.info(f"# Write HMM parameters into JSON: {hmm_modelfile}") + hmm_model_save(hmm_modelfile, hmm_model, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region, options.hmm_type) + + # if 
--modelonly option provided, exit script after hmm model is saved if options.hmm_modelonly: - options.info( f"# Complete - HMM model was saved, program exited (--modelonly option was provided) ") + options.info("# Complete - HMM model was saved, program exited (--modelonly option was provided) ") sys.exit() # Now tell users the parameters of the HMM - assignments = [ "", "", "" ] - assignments[ i_open_region ] = "open" - assignments[ i_nucleosomal_region ] = "nuc" - assignments[ i_background_region ] = "bg" - - options.info( f"# The Hidden Markov Model for signals of binsize of {options.hmm_binsize} basepairs:") - options.info( f"# open state index: state{i_open_region}" ) - options.info( f"# nucleosomal state index: state{i_nucleosomal_region}" ) - options.info( f"# background state index: state{i_background_region}" ) - options.info( f"# Starting probabilities of states:") - options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s}".format( assignments ) ) - options.info( "# {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g}".format( hmm_model.startprob_ ) ) - options.info( f"# HMM Transition probabilities:") - options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s}".format( assignments ) ) - options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[0], hmm_model.transmat_[0]) ) - options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[1], hmm_model.transmat_[1]) ) - options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[2], hmm_model.transmat_[2]) ) + assignments = ["", "", ""] + assignments[i_open_region] = "open" + assignments[i_nucleosomal_region] = "nuc" + assignments[i_background_region] = "bg" + options.info(f"# The Hidden Markov Model for signals of binsize of {options.hmm_binsize} basepairs:") + options.info(f"# open state index: state{i_open_region}") + options.info(f"# nucleosomal state index: state{i_nucleosomal_region}") + options.info(f"# background state index: 
state{i_background_region}") + options.info( "# Starting probabilities of states:") + options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s}".format(assignments)) + options.info( "# {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g}".format(hmm_model.startprob_)) + options.info( "# HMM Transition probabilities:") + options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s}".format(assignments)) + options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[0], hmm_model.transmat_[0])) + options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[1], hmm_model.transmat_[1])) + options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[2], hmm_model.transmat_[2])) + if options.hmm_type == 'gaussian': - options.info( f"# HMM Emissions (means): ") - options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format( ["short", "mono", "di", "tri"] ) ) - options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.means_[0]) ) - options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.means_[1]) ) - options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.means_[2]) ) + options.info("# HMM Emissions (means): ") + options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"])) + options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.means_[0])) + options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.means_[1])) + options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.means_[2])) if options.hmm_type == 'poisson': - options.info( f"# HMM Emissions (lambdas): ") - options.info( "# {0[0]:>10s} 
{0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format( ["short", "mono", "di", "tri"] ) ) - options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.lambdas_[0]) ) - options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.lambdas_[1]) ) - options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.lambdas_[2]) ) + options.info( "# HMM Emissions (lambdas): ") + options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"])) + options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.lambdas_[0])) + options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.lambdas_[1])) + options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.lambdas_[2])) ############################################# # 5. Predict ############################################# # Our prediction strategy will be different with HMMRATAC, we will first ask MACS call peaks with loose cutoff, then for each peak we will run HMM prediction to figure out labels. And for the rest of genomic regions, just mark them as 'background'. 
- options.info( f"#5 Decode with Viterbi to predict states" ) + options.info("#5 Decode with Viterbi to predict states") # the following /4 is totally arbitrary, we may need to fix it - candidate_peaks = fc_bdg.call_peaks (cutoff=options.prescan_cutoff, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False) - options.info( f"#5 Total candidate peaks : {candidate_peaks.total}" ) + candidate_peaks = fc_bdg.call_peaks(cutoff=options.prescan_cutoff, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False) + options.info(f"#5 Total candidate peaks : {candidate_peaks.total}") # Now we convert PeakIO to Regions and filter blacklisted regions candidate_regions = Regions() - candidate_regions.init_from_PeakIO( candidate_peaks ) + candidate_regions.init_from_PeakIO(candidate_peaks) # We will expand the regions to both directions and merge overlap - options.info( f"# We expand the candidate regions with {options.hmm_training_flanking} and merge overlap" ) - candidate_regions.expand( options.hmm_training_flanking ) + options.info(f"# We expand the candidate regions with {options.hmm_training_flanking} and merge overlap") + candidate_regions.expand(options.hmm_training_flanking) candidate_regions.merge_overlap() - options.info( f"# after expanding and merging, we have {candidate_regions.total} candidate regions" ) - + options.info(f"# after expanding and merging, we have {candidate_regions.total} candidate regions") + # remove peaks overlapping with blacklisted regions if options.blacklist: - candidate_regions.exclude( blacklist_regions ) - options.info( f"# after removing those overlapping with provided blacklisted regions, we have {candidate_regions.total} left" ) + candidate_regions.exclude(blacklist_regions) + options.info(f"# after removing those overlapping with provided blacklisted regions, we have {candidate_regions.total} left") # extract signals - options.info( f"# Extract signals in candidate regions and decode with HMM") + 
options.info("# Extract signals in candidate regions and decode with HMM") # we will do the extraction and prediction in a step of 10000 regions by default - + # Note: we can implement in a different way to extract then predict for each candidate region. - # predicted_results = hmm_decode_each_region ( digested_atac_signals, candidate_regions, hmm_model, binsize = options.hmm_binsize ) + # predicted_results = hmm_decode_each_region (digested_atac_signals, candidate_regions, hmm_model, binsize = options.hmm_binsize) # Note: we implement in a way that we will decode the candidate regions 10000 regions at a time so 1. we can make it running in parallel in the future; 2. we can reduce the memory usage. - options.info( f"# Use HMM to predict states") + options.info("# Use HMM to predict states") n = 0 # we create a temporary file to save the proba predicted from hmm @@ -406,8 +405,8 @@ def run( args ): options.info("# decoding %d..." % (n * options.decoding_steps)) # then extrac data from digested signals, create cr_bins, cr_data, and cr_data_lengths - [cr_bins, cr_data, cr_data_lengths] = extract_signals_from_regions( digested_atac_signals, cr, binsize = options.hmm_binsize, hmm_type = options.hmm_type ) - #options.debug( "# extract_signals_from_regions complete") + [cr_bins, cr_data, cr_data_lengths] = extract_signals_from_regions(digested_atac_signals, cr, binsize=options.hmm_binsize, hmm_type=options.hmm_type) + # options.debug("# extract_signals_from_regions complete") prob_data = hmm_predict(cr_data, cr_data_lengths, hmm_model) assert len(prob_data) == len(cr_bins) @@ -421,60 +420,61 @@ def run( args ): prob_data = [] gc.collect() - predicted_proba_file.seek(0) # reset - options.info( f"# predicted_proba files written...") + predicted_proba_file.seek(0) # reset + options.info("# predicted_proba files written...") ############################################# # 6. 
Output - add to OutputWriter ############################################# - options.info( f"# Write the output...") + options.info("# Write the output...") # Now taken the candidate_bins and predicted_proba, we can generate various # outputs - + # One thing to remember about candidate_bins is that the position # in this array is the 'end' of the bin, the actual region is the # 'end'-'binsize' to the 'end'. - + # First, the likelihoods for each of the three states in a bedGraph if options.save_likelihoods: - options.info( f"# Write the likelihoods for each states into three bedGraph files {options.name}_open.bdg, {options.name}_nuc.bdg, and {options.name}_bg.bdg") - open_state_bdg_fhd = open( open_state_bdgfile, "w" ) - nuc_state_bdg_fhd = open( nuc_state_bdgfile, "w" ) - bg_state_bdg_fhd = open( bg_state_bdgfile, "w" ) - save_proba_to_bedGraph( predicted_proba_file, options.hmm_binsize, open_state_bdg_fhd, nuc_state_bdg_fhd, bg_state_bdg_fhd, i_open_region, i_nucleosomal_region, i_background_region ) - predicted_proba_file.seek(0) # reset + options.info(f"# Write the likelihoods for each states into three bedGraph files {options.name}_open.bdg, {options.name}_nuc.bdg, and {options.name}_bg.bdg") + open_state_bdg_fhd = open(open_state_bdgfile, "w") + nuc_state_bdg_fhd = open(nuc_state_bdgfile, "w") + bg_state_bdg_fhd = open(bg_state_bdgfile, "w") + save_proba_to_bedGraph(predicted_proba_file, options.hmm_binsize, open_state_bdg_fhd, nuc_state_bdg_fhd, bg_state_bdg_fhd, i_open_region, i_nucleosomal_region, i_background_region) + predicted_proba_file.seek(0) # reset open_state_bdg_fhd.close() nuc_state_bdg_fhd.close() bg_state_bdg_fhd.close() - options.info( f"# finished writing proba_to_bedgraph") - + options.info("# finished writing proba_to_bedgraph") + # # Generate states path: - states_path = generate_states_path( predicted_proba_file, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region ) - options.info( f"# finished generating states 
path") + states_path = generate_states_path(predicted_proba_file, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region) + options.info("# finished generating states path") predicted_proba_file.close() #kill the temp file # Save states path if needed # PS: we need to implement extra feature to include those regions NOT in candidate_bins and assign them as 'background state'. if options.save_states: - options.info( f"# Write states assignments in a BED file: {options.name}_states.bed" ) - with open( states_file, "w" ) as f: - save_states_bed( states_path, f ) + options.info(f"# Write states assignments in a BED file: {options.name}_states.bed") + with open(states_file, "w") as f: + save_states_bed(states_path, f) - options.info( f"# Write accessible regions in a narrowPeak file: {options.name}_accessible_regions.narrowPeak") - with open( accessible_file, "w" ) as ofhd: - save_accessible_regions( states_path, ofhd, options.openregion_minlen, fc_bdg ) + options.info(f"# Write accessible regions in a narrowPeak file: {options.name}_accessible_regions.narrowPeak") + with open(accessible_file, "w") as ofhd: + save_accessible_regions(states_path, ofhd, options.openregion_minlen, fc_bdg) - options.info( f"# Finished") + options.info("# Finished") -def save_proba_to_bedGraph( predicted_proba_file, binsize, open_state_bdg_file, nuc_state_bdg_file, bg_state_bdg_file, i_open, i_nuc, i_bg ): - open_state_bdg_file = bedGraphIO( open_state_bdg_file ) - nuc_state_bdg_file = bedGraphIO( nuc_state_bdg_file ) - bg_state_bdg_file = bedGraphIO( bg_state_bdg_file ) +def save_proba_to_bedGraph(predicted_proba_file, binsize, open_state_bdg_file, nuc_state_bdg_file, bg_state_bdg_file, i_open, i_nuc, i_bg): + + open_state_bdg_file = bedGraphIO(open_state_bdg_file) + nuc_state_bdg_file = bedGraphIO(nuc_state_bdg_file) + bg_state_bdg_file = bedGraphIO(bg_state_bdg_file) open_state_bdg = open_state_bdg_file.data nuc_state_bdg = nuc_state_bdg_file.data bg_state_bdg = 
bg_state_bdg_file.data - + prev_chrom_name = None prev_bin_end = None @@ -492,35 +492,37 @@ def save_proba_to_bedGraph( predicted_proba_file, binsize, open_state_bdg_file, # we start a new chromosome if start_pos > 0: # add the first unannotated region as background - open_state_bdg.add_loc( chrname, 0, start_pos, 0.0 ) - nuc_state_bdg.add_loc( chrname, 0, start_pos, 0.0 ) - bg_state_bdg.add_loc( chrname, 0, start_pos, 1.0 ) + open_state_bdg.add_loc(chrname, 0, start_pos, 0.0) + nuc_state_bdg.add_loc(chrname, 0, start_pos, 0.0) + bg_state_bdg.add_loc(chrname, 0, start_pos, 1.0) prev_chrom_name = chrname else: # now check if the prev_bin_end is start_pos, if not, add a gap of background if prev_bin_end < start_pos: - open_state_bdg.add_loc( chrname, prev_bin_end, start_pos, 0.0 ) - nuc_state_bdg.add_loc( chrname, prev_bin_end, start_pos, 0.0 ) - bg_state_bdg.add_loc( chrname, prev_bin_end, start_pos, 1.0 ) + open_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 0.0) + nuc_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 0.0) + bg_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 1.0) - open_state_bdg.add_loc( chrname, start_pos, end_pos, pp_open ) - nuc_state_bdg.add_loc( chrname, start_pos, end_pos, pp_nuc ) - bg_state_bdg.add_loc( chrname, start_pos, end_pos, pp_bg ) + open_state_bdg.add_loc(chrname, start_pos, end_pos, pp_open) + nuc_state_bdg.add_loc(chrname, start_pos, end_pos, pp_nuc) + bg_state_bdg.add_loc(chrname, start_pos, end_pos, pp_bg) prev_bin_end = end_pos - - open_state_bdg_file.write_bedGraph( "Open States", "Likelihoods of being Open States", trackline = False ) - nuc_state_bdg_file.write_bedGraph( "Nucleosomal States", "Likelihoods of being Nucleosomal States", trackline = False ) - bg_state_bdg_file.write_bedGraph( "Background States", "Likelihoods of being Background States", trackline = False ) + + open_state_bdg_file.write_bedGraph("Open States", "Likelihoods of being Open States", trackline=False) + 
nuc_state_bdg_file.write_bedGraph("Nucleosomal States", "Likelihoods of being Nucleosomal States", trackline=False) + bg_state_bdg_file.write_bedGraph("Background States", "Likelihoods of being Background States", trackline=False) return -def save_states_bed( states_path, states_bedfile ): - # we do not need to output background state. - for l in range( len( states_path ) ): - if states_path[l][3] != "bg": - states_bedfile.write( "%s\t" % states_path[l][0].decode() ) - states_bedfile.write( "%d\t%d\t%s\n" % states_path[l][1:] ) + +def save_states_bed(states_path, states_bedfile): + # we do not need to output background state. + for l_len in range(len(states_path)): + if states_path[l_len][3] != "bg": + states_bedfile.write("%s\t" % states_path[l_len][0].decode()) + states_bedfile.write("%d\t%d\t%s\n" % states_path[l_len][1:]) return + def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg): # predicted_proba_file is a temporary file ret_states_path = [] @@ -529,7 +531,7 @@ def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg): prev_chrom_name = None prev_bin_end = None prev_label = None - + for pp_line in predicted_proba_file: pp_data = pp_line.strip().split(b',') @@ -542,7 +544,7 @@ def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg): # find the best state as label label = labels_list[max((pp_open, 0), (pp_nuc, 1), (pp_bg, 2), key=lambda x: x[0])[1]] - + if chrname != prev_chrom_name: # we start a new chromosome if start_pos > 0: @@ -566,6 +568,7 @@ def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg): prev_bin_end = end_pos return ret_states_path + def save_accessible_regions(states_path, accessible_region_file, openregion_minlen, bdgscore): # Function to add regions to the list def add_regions(i, regions): @@ -580,18 +583,18 @@ def add_regions(i, regions): for i in range(len(states_path)-2): if (states_path[i][3] == 'nuc' and states_path[i+1][3] == 'open' and 
states_path[i+2][3] == 'nuc' and states_path[i][2] == states_path[i+1][1] and states_path[i+1][2] == states_path[i+2][1] and - states_path[i+2][2] - states_path[i][1] > openregion_minlen): # require nuc-open-nuc entire region start/endpos > openregion_minlen + states_path[i+2][2] - states_path[i][1] > openregion_minlen): # require nuc-open-nuc entire region start/endpos > openregion_minlen accessible_regions = add_regions(i, accessible_regions) - - # remove 'nuc' regions: + + # remove 'nuc' regions: accessible_regions = [tup for tup in accessible_regions if tup[3] != 'nuc'] # Generate broadpeak object openpeak = PeakIO() for region in accessible_regions[:-1]: - openpeak.add(chromosome=region[0], start=region[1], end=region[2]) + openpeak.add(chromosome=region[0], start=region[1], end=region[2]) # refine peak summit and score using bedGraphTrackI with scores - openpeak = bdgscore.refine_peaks( openpeak ) + openpeak = bdgscore.refine_peaks(openpeak) openpeak.write_to_narrowPeak(accessible_region_file) return diff --git a/MACS3/Commands/pileup_cmd.py b/MACS3/Commands/pileup_cmd.py index 9890ebdb..933100b9 100644 --- a/MACS3/Commands/pileup_cmd.py +++ b/MACS3/Commands/pileup_cmd.py @@ -4,55 +4,57 @@ under the terms of the BSD License (see the file LICENSE included with the distribution). 
""" -# Time-stamp: <2020-11-24 16:50:16 Tao Liu> +# Time-stamp: <2024-10-02 16:51:15 Tao Liu> # ------------------------------------ # python modules # ------------------------------------ import os -import sys # ------------------------------------ # own python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +# from MACS3.Utilities.Constants import * from MACS3.Utilities.OptValidator import opt_validate_pileup from MACS3.Signal.Pileup import pileup_and_write_se, pileup_and_write_pe # ------------------------------------ # Main function # ------------------------------------ -def run( o_options ): + + +def run(o_options): """The Main function/pipeline for duplication filter. """ # Parse options... - options = opt_validate_pileup( o_options ) + options = opt_validate_pileup(o_options) # end of parsing commandline options info = options.info - warn = options.warn - debug = options.debug - error = options.error - #0 output arguments - options.PE_MODE = options.format in ('BAMPE','BEDPE') - - #0 prepare output file - outfile = os.path.join( options.outdir, options.outputfile ).encode() - if os.path.isfile( outfile ): - info("# Existing file %s will be replaced!" % outfile ) - os.unlink( outfile ) - - #1 Read tag files + # warn = options.warn + # debug = options.debug + # error = options.error + + # 0 output arguments + options.PE_MODE = options.format in ('BAMPE', 'BEDPE') + + # 0 prepare output file + outfile = os.path.join(options.outdir, options.outputfile).encode() + if os.path.isfile(outfile): + info("# Existing file %s will be replaced!" 
% outfile) + os.unlink(outfile) + + # 1 Read tag files info("# read alignment files...") if options.PE_MODE: info("# read input file in Paired-end mode.") - treat = load_frag_files_options ( options ) # return PETrackI object - t0 = treat.total # total fragments - info("# total fragments/pairs in alignment file: %d" % (t0) ) + treat = load_frag_files_options(options) # return PETrackI object + t0 = treat.total # total fragments + info("# total fragments/pairs in alignment file: %d" % (t0)) info("# Pileup paired-end alignment file.") - pileup_and_write_pe(treat, outfile ) + pileup_and_write_pe(treat, outfile) else: - (tsize, treat) = load_tag_files_options (options) + (tsize, treat) = load_tag_files_options(options) info("# tag size = %d", tsize) @@ -68,7 +70,8 @@ def run( o_options ): info("# Done! Check %s" % options.outputfile) -def load_tag_files_options ( options ): + +def load_tag_files_options(options): """From the options, load alignment tags. """ @@ -76,19 +79,20 @@ def load_tag_files_options ( options ): tp = options.parser(options.ifile[0], buffer_size=options.buffer_size) tsize = tp.tsize() treat = tp.build_fwtrack() - #treat.sort() + # treat.sort() if len(options.ifile) > 1: # multiple input for tfile in options.ifile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) - treat = tp.append_fwtrack( treat ) - #treat.sort() + treat = tp.append_fwtrack(treat) + # treat.sort() treat.finalize() options.info("tag size is determined as %d bps" % tsize) return (tsize, treat) -def load_frag_files_options ( options ): + +def load_frag_files_options(options): """From the options, load treatment fragments and control fragments (if available). 
""" @@ -96,12 +100,12 @@ def load_frag_files_options ( options ): tp = options.parser(options.ifile[0], buffer_size=options.buffer_size) treat = tp.build_petrack() - #treat.sort() + # treat.sort() if len(options.ifile) > 1: # multiple input for tfile in options.ifile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) - treat = tp.append_petrack( treat ) - #treat.sort() + treat = tp.append_petrack(treat) + # treat.sort() treat.finalize() return treat diff --git a/MACS3/Commands/predictd_cmd.py b/MACS3/Commands/predictd_cmd.py index 4cf792ec..d3781bc1 100644 --- a/MACS3/Commands/predictd_cmd.py +++ b/MACS3/Commands/predictd_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2020-11-24 16:59:33 Tao Liu> +# Time-stamp: <2024-10-02 16:53:35 Tao Liu> """Description: predict fragment size. @@ -11,49 +11,48 @@ # python modules # ------------------------------------ -import os -import sys - # ------------------------------------ # own python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +from MACS3.Utilities.Constants import MAX_PAIRNUM from MACS3.Utilities.OptValidator import opt_validate_predictd -from MACS3.Signal.PeakModel import PeakModel,NotEnoughPairsException -from MACS3.Signal.Prob import binomial_cdf_inv +from MACS3.Signal.PeakModel import PeakModel, NotEnoughPairsException +# from MACS3.Signal.Prob import binomial_cdf_inv from MACS3.IO.OutputWriter import model2r_script # ------------------------------------ # Main function # ------------------------------------ -def run( o_options ): + + +def run(o_options): """The Main function/pipeline for duplication filter. """ # Parse options... 
- options = opt_validate_predictd( o_options ) + options = opt_validate_predictd(o_options) # end of parsing commandline options info = options.info warn = options.warn debug = options.debug - error = options.error - #0 output arguments - options.PE_MODE = options.format in ('BAMPE','BEDPE') + # error = options.error + # 0 output arguments + options.PE_MODE = options.format in ('BAMPE', 'BEDPE') - #1 Read tag files + # 1 Read tag files if options.PE_MODE: info("# read input file in Paired-end mode.") - treat = load_frag_files_options ( options ) # return PETrackI object + treat = load_frag_files_options(options) # return PETrackI object t0 = treat.total - info("# total fragments/pairs in alignment file: %d" % (t0) ) + info("# total fragments/pairs in alignment file: %d" % (t0)) else: info("# read alignment files...") - treat = load_tag_files_options (options) + treat = load_tag_files_options(options) t0 = treat.total info("# tag size = %d" % options.tsize) treat.fw = options.tsize info("# total tags in alignment file: %d", t0) - #2 Build Model + # 2 Build Model info("# Build Peak Model...") if options.PE_MODE: d = treat.average_template_length @@ -61,9 +60,9 @@ def run( o_options ): return try: - peakmodel = PeakModel(treatment = treat, - max_pairnum = MAX_PAIRNUM, - opt = options + peakmodel = PeakModel(treatment=treat, + max_pairnum=MAX_PAIRNUM, + opt=options ) peakmodel.build() info("# finished!") @@ -71,15 +70,16 @@ def run( o_options ): debug("# min_tags: %d" % (peakmodel.min_tags)) debug("# d: %d" % (peakmodel.d)) info("# predicted fragment length is %d bps" % peakmodel.d) - info("# alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d))) + info("# alternative fragment length(s) may be %s bps" % ','.join(map(str, peakmodel.alternative_d))) info("# Generate R script for model : %s" % (options.modelR)) - model2r_script(peakmodel,options.modelR, options.rfile ) + model2r_script(peakmodel, options.modelR, options.rfile) 
options.d = peakmodel.d except NotEnoughPairsException: warn("# Can't find enough pairs of symmetric peaks to build model!") -def load_tag_files_options ( options ): + +def load_tag_files_options(options): """From the options, load alignment tags. """ @@ -89,19 +89,20 @@ def load_tag_files_options ( options ): ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() - #treat.sort() + # treat.sort() if len(options.ifile) > 1: # multiple input for tfile in options.ifile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) - treat = tp.append_fwtrack( treat ) - #treat.sort() + treat = tp.append_fwtrack(treat) + # treat.sort() treat.finalize() options.info("tag size is determined as %d bps" % options.tsize) return treat -def load_frag_files_options ( options ): + +def load_frag_files_options(options): """From the options, load treatment fragments and control fragments (if available). """ @@ -112,7 +113,7 @@ def load_frag_files_options ( options ): if len(options.ifile) > 1: # multiple input for tfile in options.ifile[1:]: - tp = options.parser(ifile, buffer_size=options.buffer_size) - treat = tp.append_petrack( treat ) + tp = options.parser(tfile, buffer_size=options.buffer_size) + treat = tp.append_petrack(treat) treat.finalize() return treat diff --git a/MACS3/Commands/randsample_cmd.py b/MACS3/Commands/randsample_cmd.py index 4ee88bc3..043f1e8d 100644 --- a/MACS3/Commands/randsample_cmd.py +++ b/MACS3/Commands/randsample_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2020-11-24 17:00:16 Tao Liu> +# Time-stamp: <2024-10-02 16:55:48 Tao Liu> """Description: Random sample certain number/percentage of tags. 
@@ -17,37 +17,39 @@ # ------------------------------------ # own python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +# from MACS3.Utilities.Constants import * from MACS3.Utilities.OptValidator import opt_validate_randsample # ------------------------------------ # Main function # ------------------------------------ -def run( options0 ): - options = opt_validate_randsample( options0 ) + + +def run(options0): + options = opt_validate_randsample(options0) # end of parsing commandline options info = options.info - warn = options.warn - debug = options.debug + # warn = options.warn + # debug = options.debug error = options.error - options.PE_MODE = options.format in ('BAMPE','BEDPE') + options.PE_MODE = options.format in ('BAMPE', 'BEDPE') - #0 check output file + # 0 check output file if options.outputfile: - outfhd = open( os.path.join( options.outdir, options.outputfile ), "w" ) + outfhd = open(os.path.join(options.outdir, options.outputfile), "w") else: outfhd = sys.stdout - #1 Read tag files + # 1 Read tag files if options.PE_MODE: info("# read input file in Paired-end mode.") - treat = load_frag_files_options ( options ) # return PETrackI object - t0 = treat.total # total fragments - info("# total fragments/pairs in alignment file: %d" % (t0) ) + treat = load_frag_files_options(options) # return PETrackI object + t0 = treat.total # total fragments + info("# total fragments/pairs in alignment file: %d" % (t0)) else: info("read tag files...") - treat = load_tag_files_options (options) + treat = load_tag_files_options(options) info("tag size = %d" % options.tsize) treat.fw = options.tsize @@ -65,9 +67,9 @@ def run( options0 ): info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage)) if options.seed >= 0: - info(" Random seed has been set as: %d" % options.seed ) + info(" Random seed has been set as: %d" % options.seed) - treat.sample_percent(options.percentage/100.0, options.seed ) + 
treat.sample_percent(options.percentage/100.0, options.seed) info(" tags after random sampling in alignment file: %d" % (treat.total)) @@ -75,7 +77,8 @@ def run( options0 ): treat.print_to_bed(fhd=outfhd) info("finished! Check %s." % options.outputfile) -def load_tag_files_options ( options ): + +def load_tag_files_options(options): """From the options, load alignment tags. """ @@ -85,19 +88,20 @@ def load_tag_files_options ( options ): ttsize = tp.tsize() options.tsize = ttsize treat = tp.build_fwtrack() - #treat.sort() + # treat.sort() if len(options.ifile) > 1: # multiple input for ifile in options.ifile[1:]: tp = options.parser(ifile, buffer_size=options.buffer_size) - treat = tp.append_fwtrack( treat ) - #treat.sort() + treat = tp.append_fwtrack(treat) + # treat.sort() treat.finalize() options.info("tag size is determined as %d bps" % options.tsize) return treat -def load_frag_files_options ( options ): + +def load_frag_files_options(options): """From the options, load treatment fragments and control fragments (if available). 
""" @@ -105,12 +109,12 @@ def load_frag_files_options ( options ): tp = options.parser(options.ifile[0], buffer_size=options.buffer_size) treat = tp.build_petrack() - #treat.sort() + # treat.sort() if len(options.ifile) > 1: # multiple input for ifile in options.ifile[1:]: tp = options.parser(ifile, buffer_size=options.buffer_size) - treat = tp.append_petrack( treat ) - #treat.sort() + treat = tp.append_petrack(treat) + # treat.sort() treat.finalize() return treat diff --git a/MACS3/Commands/refinepeak_cmd.py b/MACS3/Commands/refinepeak_cmd.py index de7c64b6..47f7610a 100644 --- a/MACS3/Commands/refinepeak_cmd.py +++ b/MACS3/Commands/refinepeak_cmd.py @@ -1,4 +1,4 @@ -# Time-stamp: <2020-11-30 16:14:14 Tao Liu> +# Time-stamp: <2024-10-02 17:01:42 Tao Liu> """Description: refine peak summits @@ -12,63 +12,70 @@ # ------------------------------------ import os -import sys from collections import Counter # ------------------------------------ # own python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +# from MACS3.Utilities.Constants import * from MACS3.Utilities.OptValidator import opt_validate_refinepeak -from MACS3.Signal.Prob import binomial_cdf_inv +# from MACS3.Signal.Prob import binomial_cdf_inv from MACS3.IO.PeakIO import PeakIO # ------------------------------------ # Main function # ------------------------------------ -def run( o_options ): + + +def run(o_options): """The Main function/pipeline for duplication filter. """ # Parse options... 
- options = opt_validate_refinepeak( o_options ) + options = opt_validate_refinepeak(o_options) # end of parsing commandline options info = options.info - warn = options.warn - debug = options.debug - error = options.error + # warn = options.warn + # debug = options.debug + # error = options.error if options.ofile: - outputfile = open( os.path.join( options.outdir, options.ofile ), 'w' ) + outputfile = open(os.path.join(options.outdir, options.ofile), 'w') options.oprefix = options.ofile else: - outputfile = open( os.path.join( options.outdir, "%s_refinepeak.bed" % options.oprefix), "w" ) + outputfile = open(os.path.join(options.outdir, "%s_refinepeak.bed" % options.oprefix), "w") - - peakio = open(options.bedfile,"rb") + peakio = open(options.bedfile, "rb") peaks = PeakIO() - for l in peakio: - fs = l.rstrip().split() - peaks.add( fs[0], int(fs[1]), int(fs[2]), name=fs[3] ) + for l_p in peakio: + fs = l_p.rstrip().split() + peaks.add(fs[0], int(fs[1]), int(fs[2]), name=fs[3]) peaks.sort() peakio.close() - #1 Read tag files + # 1 Read tag files info("read tag files...") - fwtrack = load_tag_files_options (options) + fwtrack = load_tag_files_options(options) - retval = fwtrack.compute_region_tags_from_peaks( peaks, find_summit, window_size = options.windowsize, cutoff = options.cutoff ) - outputfile.write( (b"\n".join( [b"%s\t%d\t%d\t%s\t%.2f" % x for x in retval] )).decode() ) + retval = fwtrack.compute_region_tags_from_peaks(peaks, find_summit, window_size=options.windowsize, cutoff=options.cutoff) + outputfile.write((b"\n".join([b"%s\t%d\t%d\t%s\t%.2f" % x for x in retval])).decode()) outputfile.close() info("Done!") -def find_summit(chrom, plus, minus, peak_start, peak_end, name = b"peak", window_size=100, cutoff = 5): - left_sum = lambda strand, pos, width = window_size: sum([strand[x] for x in strand if x <= pos and x >= pos - width]) - right_sum = lambda strand, pos, width = window_size: sum([strand[x] for x in strand if x >= pos and x <= pos + width]) - 
left_forward = lambda strand, pos: strand.get(pos,0) - strand.get(pos-window_size, 0) - right_forward = lambda strand, pos: strand.get(pos + window_size, 0) - strand.get(pos, 0) +def find_summit(chrom, plus, minus, peak_start, peak_end, name=b"peak", window_size=100, cutoff=5): + def left_sum(strand, pos, width=window_size): + return sum([strand[x] for x in strand if x <= pos and x >= pos - width]) + + def right_sum(strand, pos, width=window_size): + return sum([strand[x] for x in strand if x >= pos and x <= pos + width]) + + def left_forward(strand, pos): + return strand.get(pos, 0) - strand.get(pos-window_size, 0) + + def right_forward(strand, pos): + return strand.get(pos + window_size, 0) - strand.get(pos, 0) watson, crick = (Counter(plus), Counter(minus)) watson_left = left_sum(watson, peak_start) @@ -82,32 +89,32 @@ def find_summit(chrom, plus, minus, peak_start, peak_end, name = b"peak", window watson_left += left_forward(watson, j) watson_right += right_forward(watson, j) crick_left += left_forward(crick, j) - crick_right += right_forward(crick,j) + crick_right += right_forward(crick, j) wtd_max_val = max(wtd_list) wtd_max_pos = wtd_list.index(wtd_max_val) + peak_start - #return (chrom, wtd_max_pos, wtd_max_pos+1, wtd_max_val) + # return (chrom, wtd_max_pos, wtd_max_pos+1, wtd_max_val) if wtd_max_val > cutoff: - return (chrom, wtd_max_pos, wtd_max_pos+1, name+b"_R" , wtd_max_val) # 'R'efined + return (chrom, wtd_max_pos, wtd_max_pos+1, name+b"_R", wtd_max_val) # 'R'efined else: - return (chrom, wtd_max_pos, wtd_max_pos+1, name+b"_F" , wtd_max_val) # 'F'ailed + return (chrom, wtd_max_pos, wtd_max_pos+1, name+b"_F", wtd_max_val) # 'F'ailed -def load_tag_files_options ( options ): + +def load_tag_files_options(options): """From the options, load alignment tags. 
""" options.info("# read treatment tags...") tp = options.parser(options.ifile[0], buffer_size=options.buffer_size) treat = tp.build_fwtrack() - #treat.sort() + # treat.sort() if len(options.ifile) > 1: # multiple input for tfile in options.ifile[1:]: tp = options.parser(tfile, buffer_size=options.buffer_size) - treat = tp.append_fwtrack( treat ) - #treat.sort() + treat = tp.append_fwtrack(treat) + # treat.sort() treat.finalize() return treat - diff --git a/MACS3/Utilities/Constants.py b/MACS3/Utilities/Constants.py index ebdb81c6..8e64282e 100644 --- a/MACS3/Utilities/Constants.py +++ b/MACS3/Utilities/Constants.py @@ -1,15 +1,14 @@ -MACS_VERSION = "3.0.2" +MACS_VERSION = "3.0.3b" MAX_PAIRNUM = 1000 -MAX_LAMBDA = 100000 -FESTEP = 20 -BUFFER_SIZE = 100000 # np array will increase at step of 1 million items -READ_BUFFER_SIZE = 10000000 # 10M bytes for read buffer size -N_MP = 2 # Number of processers +MAX_LAMBDA = 100000 +FESTEP = 20 +BUFFER_SIZE = 100000 # np array will increase at step of 1 million items +READ_BUFFER_SIZE = 10000000 # 10M bytes for read buffer size +N_MP = 2 # Number of processers -#Effective genome size, collected from -#https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html - -EFFECTIVEGS = {"hs":2913022398, #GRCh38 - "mm":2652783500, #GRCm38 - "ce":100286401, #WBcel235 - "dm":142573017} #dm6 +# Effective genome size, collected from +# https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html +EFFECTIVEGS = {"hs": 2913022398, # GRCh38 + "mm": 2652783500, # GRCm38 + "ce": 100286401, # WBcel235 + "dm": 142573017} # dm6 diff --git a/MACS3/Utilities/Logger.py b/MACS3/Utilities/Logger.py index 98eeace6..203a27e1 100644 --- a/MACS3/Utilities/Logger.py +++ b/MACS3/Utilities/Logger.py @@ -1,17 +1,19 @@ # Logger.py to set time and memory monitoring to logging import logging import resource -import time import os import sys + class MemoryLogger(logging.Logger): def __init__(self, name, 
level=logging.NOTSET): super().__init__(name, level) - - def _log(self, level, msg, args, exc_info=None, extra=None, stack_info=False): + + def _log(self, level, msg, args, exc_info=None, + extra=None, stack_info=False): mem_usage = self.get_memory_usage() - super()._log(level, f"[{mem_usage} MB] {msg}", args, exc_info, extra, stack_info) + super()._log(level, f"[{mem_usage} MB] {msg}", + args, exc_info, extra, stack_info) @staticmethod def get_memory_usage(): @@ -19,7 +21,8 @@ def get_memory_usage(): if os.name == 'posix' and os.uname().sysname == 'Darwin': # macOS mem_usage = mem_usage / 1024 # Convert to kilobytes - return int( mem_usage / 1024 ) # Convert to MB + return int(mem_usage / 1024) # Convert to MB + logging.basicConfig(level=20, format='%(levelname)-5s @ %(asctime)s: %(message)s ', @@ -28,4 +31,4 @@ def get_memory_usage(): filemode="w" ) -logging.setLoggerClass(MemoryLogger) \ No newline at end of file +logging.setLoggerClass(MemoryLogger) diff --git a/MACS3/Utilities/OptValidator.py b/MACS3/Utilities/OptValidator.py index f1a1a8c2..451fe0b4 100644 --- a/MACS3/Utilities/OptValidator.py +++ b/MACS3/Utilities/OptValidator.py @@ -1,5 +1,4 @@ -# Time-stamp: <2024-04-19 15:11:59 Tao Liu> - +# Time-stamp: <2024-10-02 19:47:03 Tao Liu> """Module Description This code is free software; you can redistribute it and/or modify it @@ -12,33 +11,31 @@ # ------------------------------------ import sys import os -import re -import resource # we turn on memory monitoring inside of logging -from argparse import ArgumentError -from subprocess import Popen, PIPE from math import log # ------------------------------------ # MACS3 modules # ------------------------------------ -from MACS3.IO.Parser import BEDParser, ELANDResultParser, ELANDMultiParser, \ - ELANDExportParser, SAMParser, BAMParser, BAMPEParser,\ - BEDPEParser, BowtieParser, guess_parser +from MACS3.IO.Parser import (BEDParser, ELANDResultParser, + ELANDMultiParser, ELANDExportParser, + SAMParser, BAMParser, 
BAMPEParser, + BEDPEParser, BowtieParser, guess_parser) from MACS3.Utilities.Constants import EFFECTIVEGS as efgsize + # ------------------------------------ # constants # ------------------------------------ -import logging -import MACS3.Utilities.Logger +from MACS3.Utilities.Logger import logging # ------------------------------------ # Misc functions # ------------------------------------ logger = logging.getLogger(__name__) -def opt_validate_callpeak ( options ): + +def opt_validate_callpeak(options): """Validate options from a OptParser object. Ret: Validated options object. @@ -46,18 +43,18 @@ def opt_validate_callpeak ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info - + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info + # gsize try: options.gsize = efgsize[options.gsize] - except: + except KeyError: try: options.gsize = float(options.gsize) - except: + except ValueError: logger.error("Error when interpreting --gsize option: %s" % options.gsize) logger.error("Available shortcuts of effective genome sizes are %s" % ",".join(list(efgsize.keys()))) sys.exit(1) @@ -100,17 +97,12 @@ def opt_validate_callpeak ( options ): logger.error("--keep-dup should be 'auto', 'all' or an integer!") sys.exit(1) - # shiftsize>0 - #if options.shiftsize: # only if --shiftsize is set, it's true - # options.extsize = 2 * options.shiftsize - #else: # if --shiftsize is not set - # options.shiftsize = options.extsize / 2 - if options.extsize < 1 : + if options.extsize < 1: logger.error("--extsize must >= 1!") sys.exit(1) # refine_peaks, call_summits can't be combined with --broad - #if options.broad and (options.refine_peaks or options.call_summits): + # if options.broad and (options.refine_peaks or options.call_summits): # 
logger.error("--broad can't be combined with --refine-peaks or --call-summits!") # sys.exit(1) @@ -121,12 +113,12 @@ def opt_validate_callpeak ( options ): if options.pvalue: # if set, ignore qvalue cutoff options.log_qvalue = None - options.log_pvalue = log(options.pvalue,10)*-1 + options.log_pvalue = log(options.pvalue, 10) * -1 else: - options.log_qvalue = log(options.qvalue,10)*-1 + options.log_qvalue = log(options.qvalue, 10) * -1 options.log_pvalue = None if options.broad: - options.log_broadcutoff = log(options.broadcutoff,10)*-1 + options.log_broadcutoff = log(options.broadcutoff, 10) * -1 # uppercase the format string options.format = options.format.upper() @@ -144,50 +136,56 @@ def opt_validate_callpeak ( options ): sys.exit(1) # output filenames - options.peakxls = os.path.join( options.outdir, options.name+"_peaks.xls" ) - options.peakbed = os.path.join( options.outdir, options.name+"_peaks.bed" ) - options.peakNarrowPeak = os.path.join( options.outdir, options.name+"_peaks.narrowPeak" ) - options.peakBroadPeak = os.path.join( options.outdir, options.name+"_peaks.broadPeak" ) - options.peakGappedPeak = os.path.join( options.outdir, options.name+"_peaks.gappedPeak" ) - options.summitbed = os.path.join( options.outdir, options.name+"_summits.bed" ) - options.bdg_treat = os.path.join( options.outdir, options.name+"_treat_pileup.bdg" ) - options.bdg_control= os.path.join( options.outdir, options.name+"_control_lambda.bdg" ) + options.peakxls = os.path.join(options.outdir, options.name + + "_peaks.xls") + options.peakbed = os.path.join(options.outdir, options.name + + "_peaks.bed") + options.peakNarrowPeak = os.path.join(options.outdir, options.name + + "_peaks.narrowPeak") + options.peakBroadPeak = os.path.join(options.outdir, options.name + + "_peaks.broadPeak") + options.peakGappedPeak = os.path.join(options.outdir, options.name + + "_peaks.gappedPeak") + options.summitbed = os.path.join(options.outdir, options.name + + "_summits.bed") + options.bdg_treat 
= os.path.join(options.outdir, options.name + + "_treat_pileup.bdg") + options.bdg_control = os.path.join(options.outdir, options.name + + "_control_lambda.bdg") if options.cutoff_analysis: - options.cutoff_analysis_file = os.path.join( options.outdir, options.name+"_cutoff_analysis.txt" ) + options.cutoff_analysis_file = os.path.join(options.outdir, options.name + + "_cutoff_analysis.txt") else: options.cutoff_analysis_file = "None" - #options.negxls = os.path.join( options.name+"_negative_peaks.xls" ) - #options.diagxls = os.path.join( options.name+"_diag.xls" ) - options.modelR = os.path.join( options.outdir, options.name+"_model.r" ) - #options.pqtable = os.path.join( options.outdir, options.name+"_pq_table.txt" ) + options.modelR = os.path.join(options.outdir, options.name+"_model.r") options.argtxt = "\n".join(( - "# Command line: %s" % " ".join(sys.argv[1:]),\ - "# ARGUMENTS LIST:",\ - "# name = %s" % (options.name),\ - "# format = %s" % (options.format),\ - "# ChIP-seq file = %s" % (options.tfile),\ - "# control file = %s" % (options.cfile),\ - "# effective genome size = %.2e" % (options.gsize),\ - #"# tag size = %d" % (options.tsize),\ - "# band width = %d" % (options.bw),\ - "# model fold = %s\n" % (options.mfold),\ - )) + "# Command line: %s" % " ".join(sys.argv[1:]), + "# ARGUMENTS LIST:", + "# name = %s" % (options.name), + "# format = %s" % (options.format), + "# ChIP-seq file = %s" % (options.tfile), + "# control file = %s" % (options.cfile), + "# effective genome size = %.2e" % (options.gsize), + # "# tag size = %d" % (options.tsize), + "# band width = %d" % (options.bw), + "# model fold = %s\n" % (options.mfold), + )) if options.pvalue: if options.broad: - options.argtxt += "# pvalue cutoff for narrow/strong regions = %.2e\n" % (options.pvalue) - options.argtxt += "# pvalue cutoff for broad/weak regions = %.2e\n" % (options.broadcutoff) - options.argtxt += "# qvalue will not be calculated and reported as -1 in the final output.\n" + options.argtxt 
+= "# pvalue cutoff for narrow/strong regions = %.2e\n" % (options.pvalue) + options.argtxt += "# pvalue cutoff for broad/weak regions = %.2e\n" % (options.broadcutoff) + options.argtxt += "# qvalue will not be calculated and reported as -1 in the final output.\n" else: - options.argtxt += "# pvalue cutoff = %.2e\n" % (options.pvalue) - options.argtxt += "# qvalue will not be calculated and reported as -1 in the final output.\n" + options.argtxt += "# pvalue cutoff = %.2e\n" % (options.pvalue) + options.argtxt += "# qvalue will not be calculated and reported as -1 in the final output.\n" else: if options.broad: - options.argtxt += "# qvalue cutoff for narrow/strong regions = %.2e\n" % (options.qvalue) - options.argtxt += "# qvalue cutoff for broad/weak regions = %.2e\n" % (options.broadcutoff) + options.argtxt += "# qvalue cutoff for narrow/strong regions = %.2e\n" % (options.qvalue) + options.argtxt += "# qvalue cutoff for broad/weak regions = %.2e\n" % (options.broadcutoff) else: - options.argtxt += "# qvalue cutoff = %.2e\n" % (options.qvalue) + options.argtxt += "# qvalue cutoff = %.2e\n" % (options.qvalue) if options.maxgap: options.argtxt += "# The maximum gap between significant sites = %d\n" % options.maxgap @@ -231,7 +229,7 @@ def opt_validate_callpeak ( options ): else: options.argtxt += "# Paired-End mode is off\n" - #if options.refine_peaks: + # if options.refine_peaks: # options.argtxt += "# Refining peak for read balance is on\n" if options.call_summits: options.argtxt += "# Searching for subpeak summits is on\n" @@ -241,106 +239,8 @@ def opt_validate_callpeak ( options ): return options -def opt_validate_diffpeak ( options ): - """Validate options from a OptParser object. - - Ret: Validated options object. 
- """ - # logging object - logger.setLevel((4-options.verbose)*10) - - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info - - # format - options.gzip_flag = False # if the input is gzip file - -# options.format = options.format.upper() - # fox this stuff -# if True: pass -# elif options.format == "AUTO": -# options.parser = guess_parser -# else: -# logger.error("Format \"%s\" cannot be recognized!" % (options.format)) -# sys.exit(1) - - if options.peaks_pvalue: - # if set, ignore qvalue cutoff - options.peaks_log_qvalue = None - options.peaks_log_pvalue = log(options.peaks_pvalue,10)*-1 - options.track_score_method = 'p' - else: - options.peaks_log_qvalue = log(options.peaks_qvalue,10)*-1 - options.peaks_log_pvalue = None - options.track_score_method = 'q' - - if options.diff_pvalue: - # if set, ignore qvalue cutoff - options.log_qvalue = None - options.log_pvalue = log(options.diff_pvalue,10)*-1 - options.score_method = 'p' - else: - options.log_qvalue = log(options.diff_qvalue,10)*-1 - options.log_pvalue = None - options.score_method = 'q' - - # output filenames - options.peakxls = options.name+"_diffpeaks.xls" - options.peakbed = options.name+"_diffpeaks.bed" - options.peak1xls = options.name+"_diffpeaks_by_peaks1.xls" - options.peak2xls = options.name+"_diffpeaks_by_peaks2.xls" - options.bdglogLR = options.name+"_logLR.bdg" - options.bdgpvalue = options.name+"_logLR.bdg" - options.bdglogFC = options.name+"_logLR.bdg" - - options.call_peaks = True - if not (options.peaks1 == '' or options.peaks2 == ''): - if options.peaks1 == '': - raise ArgumentError('peaks1', 'Must specify both peaks1 and peaks2, or neither (to call peaks again)') - elif options.peaks2 == '': - raise ArgumentError('peaks2', 'Must specify both peaks1 and peaks2, or neither (to call peaks again)') - options.call_peaks = False - options.argtxt = "\n".join(( - "# ARGUMENTS LIST:",\ - "# name = %s" % 
(options.name),\ -# "# format = %s" % (options.format),\ - "# ChIP-seq file 1 = %s" % (options.t1bdg),\ - "# control file 1 = %s" % (options.c1bdg),\ - "# ChIP-seq file 2 = %s" % (options.t2bdg),\ - "# control file 2 = %s" % (options.c2bdg),\ - "# Peaks, condition 1 = %s" % (options.peaks1),\ - "# Peaks, condition 2 = %s" % (options.peaks2),\ - "" - )) - else: - options.argtxt = "\n".join(( - "# ARGUMENTS LIST:",\ - "# name = %s" % (options.name),\ -# "# format = %s" % (options.format),\ - "# ChIP-seq file 1 = %s" % (options.t1bdg),\ - "# control file 1 = %s" % (options.c1bdg),\ - "# ChIP-seq file 2 = %s" % (options.t2bdg),\ - "# control file 2 = %s" % (options.c2bdg),\ - "" - )) - - if options.peaks_pvalue: - options.argtxt += "# treat/control -log10(pvalue) cutoff = %.2e\n" % (options.peaks_log_pvalue) - options.argtxt += "# treat/control -log10(qvalue) will not be calculated and reported as -1 in the final output.\n" - else: - options.argtxt += "# treat/control -log10(qvalue) cutoff = %.2e\n" % (options.peaks_log_qvalue) - - if options.diff_pvalue: - options.argtxt += "# differential pvalue cutoff = %.2e\n" % (options.log_pvalue) - options.argtxt += "# differential qvalue will not be calculated and reported as -1 in the final output.\n" - else: - options.argtxt += "# differential qvalue cutoff = %.2e\n" % (options.log_qvalue) - - return options -def opt_validate_filterdup ( options ): +def opt_validate_filterdup(options): """Validate options from a OptParser object. Ret: Validated options object. 
@@ -348,18 +248,18 @@ def opt_validate_filterdup ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info - + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info + # gsize try: options.gsize = efgsize[options.gsize] - except: + except KeyError: try: options.gsize = float(options.gsize) - except: + except ValueError: logger.error("Error when interpreting --gsize option: %s" % options.gsize) logger.error("Available shortcuts of effective genome sizes are %s" % ",".join(list(efgsize.keys()))) sys.exit(1) @@ -408,7 +308,8 @@ def opt_validate_filterdup ( options ): return options -def opt_validate_randsample ( options ): + +def opt_validate_randsample(options): """Validate options from a OptParser object. Ret: Validated options object. @@ -416,11 +317,11 @@ def opt_validate_randsample ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info - + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info + # format options.gzip_flag = False # if the input is gzip file @@ -467,7 +368,8 @@ def opt_validate_randsample ( options ): return options -def opt_validate_refinepeak ( options ): + +def opt_validate_refinepeak(options): """Validate options from a OptParser object. Ret: Validated options object. 
@@ -475,11 +377,11 @@ def opt_validate_refinepeak ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info - + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info + # format options.gzip_flag = False # if the input is gzip file @@ -511,7 +413,8 @@ def opt_validate_refinepeak ( options ): return options -def opt_validate_predictd ( options ): + +def opt_validate_predictd(options): """Validate options from a OptParser object. Ret: Validated options object. @@ -519,18 +422,18 @@ def opt_validate_predictd ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info - + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info + # gsize try: options.gsize = efgsize[options.gsize] - except: + except KeyError: try: options.gsize = float(options.gsize) - except: + except ValueError: logger.error("Error when interpreting --gsize option: %s" % options.gsize) logger.error("Available shortcuts of effective genome sizes are %s" % ",".join(list(efgsize.keys()))) sys.exit(1) @@ -582,12 +485,12 @@ def opt_validate_predictd ( options ): logger.error("Upper limit of mfold should be greater than lower limit!" % options.mfold) sys.exit(1) - options.modelR = os.path.join( options.outdir, options.rfile ) + options.modelR = os.path.join(options.outdir, options.rfile) return options -def opt_validate_pileup ( options ): +def opt_validate_pileup(options): """Validate options from a OptParser object. Ret: Validated options object. 
@@ -595,11 +498,11 @@ def opt_validate_pileup ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info - + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info + # format options.gzip_flag = False # if the input is gzip file @@ -633,13 +536,14 @@ def opt_validate_pileup ( options ): options.format = options.format.upper() # extsize - if options.extsize <= 0 : + if options.extsize <= 0: logger.error("--extsize must > 0!") sys.exit(1) return options -def opt_validate_bdgcmp ( options ): + +def opt_validate_bdgcmp(options): """Validate options from a OptParser object. Ret: Validated options object. @@ -647,17 +551,18 @@ def opt_validate_bdgcmp ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info # methods should be valid: for method in set(options.method): - if method not in [ 'ppois', 'qpois', 'subtract', 'logFE', 'FE', 'logLR', 'slogLR', 'max' ]: - logger.error( "Invalid method: %s" % method ) - sys.exit( 1 ) + if method not in ['ppois', 'qpois', 'subtract', 'logFE', 'FE', + 'logLR', 'slogLR', 'max']: + logger.error("Invalid method: %s" % method) + sys.exit(1) # # of --ofile must == # of -m @@ -669,7 +574,7 @@ def opt_validate_bdgcmp ( options ): return options -def opt_validate_cmbreps ( options ): +def opt_validate_cmbreps(options): """Validate options from a OptParser object. Ret: Validated options object. 
@@ -677,39 +582,37 @@ def opt_validate_cmbreps ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info # methods should be valid: + if options.method not in ['fisher', 'max', 'mean']: + logger.error("Invalid method: %s" % options.method) + sys.exit(1) - if options.method not in [ 'fisher', 'max', 'mean']: - logger.error( "Invalid method: %s" % options.method ) - sys.exit( 1 ) - - if len( options.ifile ) < 2: + if len(options.ifile) < 2: logger.error("Combining replicates needs at least two replicates!") - sys.exit( 1 ) + sys.exit(1) # # of -i must == # of -w # if not options.weights: - # options.weights = [ 1.0 ] * len( options.ifile ) + # options.weights = [ 1.0 ] * len(options.ifile) - # if len( options.ifile ) != len( options.weights ): + # if len(options.ifile) != len(options.weights): # logger.error("Must provide same number of weights as number of input files.") - # sys.exit( 1 ) + # sys.exit(1) - # if options.method == "fisher" and len( options.ifile ) > 3: + # if options.method == "fisher" and len(options.ifile) > 3: # logger.error("NOT IMPLEMENTED! Can't combine more than 3 replicates using Fisher's method.") - # sys.exit( 1 ) - + # sys.exit(1) return options -def opt_validate_bdgopt ( options ): +def opt_validate_bdgopt(options): """Validate options from a OptParser object. Ret: Validated options object. 
@@ -717,24 +620,25 @@ def opt_validate_bdgopt ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info # methods should be valid: - if options.method.lower() not in [ 'multiply', 'add', 'p2q', 'max', 'min']: - logger.error( "Invalid method: %s" % options.method ) - sys.exit( 1 ) + if options.method.lower() not in ['multiply', 'add', 'p2q', 'max', 'min']: + logger.error("Invalid method: %s" % options.method) + sys.exit(1) - if options.method.lower() in [ 'multiply', 'add' ] and not options.extraparam: - logger.error( "Need EXTRAPARAM for method multiply or add!") - sys.exit( 1 ) + if options.method.lower() in ['multiply', 'add'] and not options.extraparam: + logger.error("Need EXTRAPARAM for method multiply or add!") + sys.exit(1) return options -def opt_validate_callvar ( options ): + +def opt_validate_callvar(options): """Validate options from a OptParser object. Ret: Validated options object. @@ -742,10 +646,10 @@ def opt_validate_callvar ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info # methods should be valid: @@ -754,7 +658,7 @@ def opt_validate_callvar ( options ): return options -def opt_validate_hmmratac ( options ): +def opt_validate_hmmratac(options): """Validate options from a OptParser object. Ret: Validated options object. 
@@ -762,11 +666,11 @@ def opt_validate_hmmratac ( options ): # logging object logger.setLevel((4-options.verbose)*10) - options.error = logger.critical # function alias - options.warn = logger.warning - options.debug = logger.debug - options.info = logger.info - + options.error = logger.critical # function alias + options.warn = logger.warning + options.debug = logger.debug + options.info = logger.info + # input options.argtxt for hmmratac options.argtxt = "# Command line: %s\n" % " ".join(sys.argv[1:]) # "# ARGUMENTS LIST:",\ @@ -777,7 +681,7 @@ def opt_validate_hmmratac ( options ): # Output options #if options.store_bdg: # options.argtxt += "# HMMRATAC will report whole genome bedgraph of all state annotations. \n" - + #if options.store_bgscore: # options.argtxt += "# HMMRATAC score will be added to each state annotation in bedgraph. \n" @@ -788,77 +692,76 @@ def opt_validate_hmmratac ( options ): # options.print_exclude = os.path.join(options.outdir, options.ofile+"Output_exclude.bed") #else: # options.print_exclude = "None" - + #if options.print_train: # options.print_train = os.path.join(options.outdir, options.ofile+"Output_training.bed") #else: # options.print_train = "None" - # EM # em_skip if options.em_skip: options.argtxt += "# EM training not performed on fragment distribution. \n" # em_means non-negative - if sum( [ x < 0 for x in options.em_means ] ): - logger.error(" --means should not be negative! ") - sys.exit( 1 ) + if sum([x < 0 for x in options.em_means]): + logger.error(" `--means` should not be negative! ") + sys.exit(1) # em_stddev non-negative - if sum( [ x < 0 for x in options.em_stddevs ] ): - logger.error(" --stddev should not be negative! ") - sys.exit( 1 ) + if sum([x < 0 for x in options.em_stddevs]): + logger.error(" `--stddev` should not be negative! ") + sys.exit(1) # min_frag_p between 0 and 1 - if options.min_frag_p <=0 or options.min_frag_p >= 1: - logger.error(" --min-frag-p should be larger than 0 and smaller than 1! 
") - sys.exit( 1 ) + if options.min_frag_p <= 0 or options.min_frag_p >= 1: + logger.error(" `--min-frag-p` should be larger than 0 and smaller than 1!") + sys.exit(1) # HMM # hmm_states non-negative int, warn if not k=3 - #if options.hmm_states <=0: + # if options.hmm_states <=0: # logger.error(" -s, --states must be an integer >= 0.") - # sys.exit( 1 ) + # sys.exit(1) #elif options.hmm_states != 3 and options.hmm_states > 0 and options.store_peaks == False: # logger.warn(" If -s, --states not k=3, recommend NOT calling peaks, use bedgraph.") # hmm_binsize > 0 - if options.hmm_binsize <=0: - logger.error(" --binsize must be larger than 0.") - sys.exit( 1 ) - - # hmm_lower less than hmm_upper, non-negative - if options.hmm_lower <0: - logger.error(" -l, --lower should not be negative! ") - sys.exit( 1 ) - if options.hmm_upper <0: - logger.error(" -u, --upper should not be negative! ") - sys.exit( 1 ) + if options.hmm_binsize <= 0: + logger.error(" `--binsize` must be larger than 0.") + sys.exit(1) + + # hmm_lower less than hmm_upper, non-negative + if options.hmm_lower < 0: + logger.error(" `-l` or `--lower` should not be negative! ") + sys.exit(1) + if options.hmm_upper < 0: + logger.error(" `-u` or `--upper` should not be negative! ") + sys.exit(1) if options.hmm_lower > options.hmm_upper: logger.error("Upper limit of fold change range should be greater than lower limit!" % options.mfold) sys.exit(1) - + # hmm_maxTrain non-negative if options.hmm_maxTrain <= 0: - logger.error(" --maxTrain should be larger than 0!") - sys.exit( 1 ) - + logger.error(" `--maxTrain` should be larger than 0!") + sys.exit(1) + # hmm_training_regions if options.hmm_training_regions: - options.argtxt += "# Using -t, --training input to train HMM instead of using fold change settings to select. \n" - + options.argtxt += "# Using -t, --training input to train HMM instead of using fold change settings to select. 
\n" + # hmm_zscore non-negative - #if options.hmm_zscore <0: + # if options.hmm_zscore <0: # logger.error(" -z, --zscore should not be negative!") - # sys.exit( 1 ) - + # sys.exit(1) + # hmm_randomSeed if options.hmm_randomSeed: options.argtxt += "# Random seed selected as: %d\n" % options.hmm_randomSeed - + # hmm_window non-negative #if options.hmm_window <0: # logger.error(" --window should not be negative! ") - # sys.exit( 1 ) + # sys.exit(1) # hmm_file #if options.hmm_file: @@ -866,53 +769,47 @@ def opt_validate_hmmratac ( options ): # hmm_modelonly if options.hmm_modelonly: - options.argtxt += "# Program will stop after generating model, which can be later applied with '--model'. \n" + options.argtxt += "# Program will stop after generating model, which can be later applied with '--model'. \n" # hmm_modelType if options.hmm_type: - options.argtxt += "# Use --hmm-type to select a Gaussian ('gaussian') or Poisson ('poisson') model for the hidden markov model in HMMRATAC. Default: 'gaussian'. \n" - + options.argtxt += "# Use --hmm-type to select a Gaussian ('gaussian') or Poisson ('poisson') model for the hidden markov model in HMMRATAC. Default: 'gaussian'. \n" # Peak Calling if options.prescan_cutoff <= 1: logger.error(" In order to use -c or --prescan-cutoff, the cutoff must be larger than 1.") - sys.exit( 1 ) - + sys.exit(1) + if options.openregion_minlen < 0: # and options.store_peaks == True: logger.error(" In order to use --minlen, the length should not be negative.") - sys.exit( 1 ) + sys.exit(1) #if options.call_score.lower() not in [ 'max', 'ave', 'med', 'fc', 'zscore', 'all']: - # logger.error( " Invalid method: %s" % options.call_score ) - # sys.exit( 1 ) + # logger.error(" Invalid method: %s" % options.call_score) + # sys.exit(1) # call_threshold non-negative #if options.call_threshold <0: # logger.error(" --threshold should not be negative! 
") - # sys.exit( 1 ) - + # sys.exit(1) # Misc - # misc_blacklist + # misc_blacklist #if options.misc_keep_duplicates: # options.argtxt += "# Duplicate reads from analysis will be stored. \n" # misc_trim non-negative #if options.misc_trim <0: # logger.error(" --trim should not be negative! ") - # sys.exit( 1 ) + # sys.exit(1) # np # should this be mp? non-negative #if options.np <0: # logger.error(" -m, --multiple-processing should not be negative! ") - # sys.exit( 1 ) - + # sys.exit(1) + # min_map_quality non-negative #if options.min_map_quality <0: # logger.error(" -q, --minmapq should not be negative! ") - # sys.exit( 1 ) - - + # sys.exit(1) return options - - diff --git a/bin/macs3 b/bin/macs3 index d9d49899..0df21cf8 100644 --- a/bin/macs3 +++ b/bin/macs3 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Time-stamp: <2024-07-19 13:14:53 Tao Liu> +# Time-stamp: <2024-10-02 12:29:58 Tao Liu> """Description: MACS v3 main executable. @@ -20,11 +20,13 @@ import tempfile # ------------------------------------ # own python modules # ------------------------------------ -from MACS3.Utilities.Constants import * +from MACS3.Utilities.Constants import MACS_VERSION # ------------------------------------ # Main function # ------------------------------------ + + def main(): """The Main function/pipeline for MACS. @@ -33,75 +35,79 @@ def main(): argparser = prepare_argparser() args = argparser.parse_args() - subcommand = args.subcommand + subcommand = args.subcommand if args.outdir: # use a output directory to store MACS output - if not os.path.exists( args.outdir ): + if not os.path.exists(args.outdir): try: - os.makedirs( args.outdir ) - except: - sys.exit( "Output directory (%s) could not be created. Terminating program." % args.outdir ) + os.makedirs(args.outdir) + except FileExistsError: + sys.exit("Output directory (%s) could not be created since it already exists. Terminate program." 
% args.outdir) + except PermissionError: + sys.exit("Output directory (%s) could not be created due to permission. Terminate program." % args.outdir) + except Exception: + sys.exit("Output directory (%s) could not be created. Terminate program." % args.outdir) if subcommand == "callpeak": # General call peak from MACS3.Commands.callpeak_cmd import run - run( args ) + run(args) elif subcommand == "bdgpeakcall": # call peak from bedGraph from MACS3.Commands.bdgpeakcall_cmd import run - run( args ) + run(args) elif subcommand == "bdgbroadcall": # call broad peak from bedGraph from MACS3.Commands.bdgbroadcall_cmd import run - run( args ) + run(args) elif subcommand == "bdgcmp": # compare treatment and control to make enrichment scores from MACS3.Commands.bdgcmp_cmd import run - run( args ) + run(args) elif subcommand == "bdgopt": # operations on the score column of bedGraph file from MACS3.Commands.bdgopt_cmd import run - run( args ) + run(args) elif subcommand == "cmbreps": # combine replicates from MACS3.Commands.cmbreps_cmd import run - run( args ) + run(args) elif subcommand == "randsample": # randomly sample sequencing reads, and save as bed file from MACS3.Commands.randsample_cmd import run - run( args ) + run(args) elif subcommand == "filterdup": # filter out duplicate reads, and save as bed file from MACS3.Commands.filterdup_cmd import run - run( args ) + run(args) elif subcommand == "bdgdiff": # differential calling from MACS3.Commands.bdgdiff_cmd import run - run( args ) + run(args) elif subcommand == "refinepeak": # refine peak summits from MACS3.Commands.refinepeak_cmd import run - run( args ) + run(args) elif subcommand == "predictd": # predict d or fragment size from MACS3.Commands.predictd_cmd import run - run( args ) + run(args) elif subcommand == "pileup": # pileup alignment results with a given extension method from MACS3.Commands.pileup_cmd import run - run( args ) + run(args) elif subcommand == "hmmratac": - # pileup alignment results with a given 
extension method + # use HMMRATAC algorithm to call ATAC-seq peaks from MACS3.Commands.hmmratac_cmd import run - run( args ) + run(args) elif subcommand == "callvar": # assemble reads in peak region and call variants from MACS3.Commands.callvar_cmd import run - run( args ) + run(args) -def prepare_argparser (): +def prepare_argparser(): """Prepare optparser object. New options will be added in this function first. @@ -109,504 +115,513 @@ def prepare_argparser (): description = "%(prog)s -- Model-based Analysis for ChIP-Sequencing" epilog = "For command line options of each command, type: %(prog)s COMMAND -h" # top-level parser - argparser = ap.ArgumentParser( description = description, epilog = epilog ) #, usage = usage ) - argparser.add_argument("--version", action="version", version="%(prog)s "+MACS_VERSION) - subparsers = argparser.add_subparsers( dest = 'subcommand' ) + argparser = ap.ArgumentParser(description=description, epilog=epilog) + argparser.add_argument("--version", action="version", version="%(prog)s " + MACS_VERSION) + subparsers = argparser.add_subparsers(dest='subcommand') subparsers.required = True # command for 'callpeak' - add_callpeak_parser( subparsers ) + add_callpeak_parser(subparsers) # command for 'bdgpeakcall' - add_bdgpeakcall_parser( subparsers ) + add_bdgpeakcall_parser(subparsers) # command for 'bdgbroadcall' - add_bdgbroadcall_parser( subparsers ) + add_bdgbroadcall_parser(subparsers) # command for 'bdgcmp' - add_bdgcmp_parser( subparsers ) + add_bdgcmp_parser(subparsers) # command for 'bdgopt' - add_bdgopt_parser( subparsers ) + add_bdgopt_parser(subparsers) # command for 'cmbreps' - add_cmbreps_parser( subparsers ) + add_cmbreps_parser(subparsers) # command for 'bdgdiff' - add_bdgdiff_parser( subparsers ) + add_bdgdiff_parser(subparsers) # command for 'filterdup' - add_filterdup_parser( subparsers ) + add_filterdup_parser(subparsers) # command for 'predictd' - add_predictd_parser( subparsers ) + add_predictd_parser(subparsers) # 
command for 'pileup' - add_pileup_parser( subparsers ) + add_pileup_parser(subparsers) # command for 'randsample' - add_randsample_parser( subparsers ) + add_randsample_parser(subparsers) # command for 'refinepeak' - add_refinepeak_parser( subparsers ) + add_refinepeak_parser(subparsers) # command for 'callvar' - add_callvar_parser( subparsers ) + add_callvar_parser(subparsers) # command for 'hmmratac' - add_hmmratac_parser( subparsers ) + add_hmmratac_parser(subparsers) return argparser -def add_outdir_option ( parser ): - parser.add_argument("--outdir", dest = "outdir", type = str, default = '', - help = "If specified all output files will be written to that directory. Default: the current working directory") -def add_output_group ( parser, required = True ): - output_group = parser.add_mutually_exclusive_group( required = required ) - output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, - help = "Output file name. Mutually exclusive with --o-prefix." ) - output_group.add_argument( "--o-prefix", dest = "oprefix", type = str, - help = "Output file prefix. Mutually exclusive with -o/--ofile." ) +def add_outdir_option(parser): + parser.add_argument("--outdir", dest="outdir", type=str, default='', + help="If specified all output files will be written to that directory. Default: the current working directory") + + +def add_output_group(parser, required=True): + output_group = parser.add_mutually_exclusive_group(required=required) + output_group.add_argument("-o", "--ofile", dest="ofile", type=str, + help="Output file name. Mutually exclusive with --o-prefix.") + output_group.add_argument("--o-prefix", dest="oprefix", type=str, + help="Output file prefix. Mutually exclusive with -o/--ofile.") -def add_callpeak_parser( subparsers ): + +def add_callpeak_parser(subparsers): """Add main function 'peak calling' argument parsers. 
""" argparser_callpeak = subparsers.add_parser("callpeak", help="Main MACS3 Function: Call peaks from alignment results.", - formatter_class = ap.RawDescriptionHelpFormatter, - epilog = """Examples: + formatter_class=ap.RawDescriptionHelpFormatter, + epilog="""Examples: 1. Peak calling for regular TF ChIP-seq: $ macs3 callpeak -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01 2. Broad peak calling on Histone Mark ChIP-seq: $ macs3 callpeak -t ChIP.bam -c Control.bam --broad -g hs --broad-cutoff 0.1 3. Peak calling on ATAC-seq (paired-end mode): $ macs3 callpeak -f BAMPE -t ATAC.bam -g hs -n test -B -q 0.01 -4. Peak calling on ATAC-seq ( focusing on insertion sites, and using single-end mode): +4. Peak calling on ATAC-seq (focusing on insertion sites, and using single-end mode): $ macs3 callpeak -f BAM -t ATAC.bam -g hs -n test -B -q 0.01 --shift -50 --extension 100 """) # group for input files - group_input = argparser_callpeak.add_argument_group( "Input files arguments" ) - group_input.add_argument( "-t", "--treatment", dest = "tfile", type = str, required = True, nargs = "+", - help = "ChIP-seq treatment file. If multiple files are given as '-t A B C', then they will all be read and pooled together. REQUIRED." ) - group_input.add_argument( "-c", "--control", dest = "cfile", type = str, nargs = "*", - help = "Control file. If multiple files are given as '-c A B C', they will be pooled to estimate ChIP-seq background noise.") - group_input.add_argument( "-f", "--format", dest = "format", type = str, - choices = ("AUTO", "BAM", "SAM", "BED", "ELAND", - "ELANDMULTI", "ELANDEXPORT", "BOWTIE", - "BAMPE", "BEDPE"), - help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. 
Please note that if the format is set as BAMPE or BEDPE, MACS3 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"", - default = "AUTO" ) - group_input.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs", - help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use." ) - group_input.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None, - help = "Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set") - group_input.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "1", - help = "It controls the behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. 
If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS3 to keep all by '--keep-dup all'. The default is to keep one tag at the same location. Default: 1" ) + group_input = argparser_callpeak.add_argument_group("Input files arguments") + group_input.add_argument("-t", "--treatment", dest="tfile", type=str, required=True, nargs="+", + help="ChIP-seq treatment file. If multiple files are given as '-t A B C', then they will all be read and pooled together. REQUIRED.") + group_input.add_argument("-c", "--control", dest="cfile", type=str, nargs="*", + help="Control file. If multiple files are given as '-c A B C', they will be pooled to estimate ChIP-seq background noise.") + group_input.add_argument("-f", "--format", dest="format", type=str, + choices=("AUTO", "BAM", "SAM", "BED", "ELAND", + "ELANDMULTI", "ELANDEXPORT", "BOWTIE", + "BAMPE", "BEDPE"), + help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS3 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"", + default="AUTO") + group_input.add_argument("-g", "--gsize", dest="gsize", type=str, default="hs", + help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. 
The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.") + group_input.add_argument("-s", "--tsize", dest="tsize", type=int, default=None, + help="Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set") + group_input.add_argument("--keep-dup", dest="keepduplicates", type=str, default="1", + help="It controls the behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS3 to keep all by '--keep-dup all'. The default is to keep one tag at the same location. Default: 1") # group for output files - group_output = argparser_callpeak.add_argument_group( "Output arguments" ) - add_outdir_option( group_output ) - group_output.add_argument( "-n", "--name", dest = "name", type = str, - help = "Experiment name, which will be used to generate output file names. DEFAULT: \"NA\"", - default = "NA" ) - group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true", - help = "Whether or not to save extended fragment pileup, and local lambda tracks (two files) at every bp into a bedGraph file. 
DEFAULT: False", - default = False ) - group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) - group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False, - help = "Instruct MACS to include trackline in the header of output files, including the bedGraph, narrowPeak, gappedPeak, BED format files. To include this trackline is necessary while uploading them to the UCSC genome browser. You can also mannually add these trackline to corresponding output files. For example, in order to upload narrowPeak file to UCSC browser, add this to as the first line -- `track type=narrowPeak name=\"my_peaks\" description=\"my peaks\"`. Default: Not to include trackline." ) - - group_output.add_argument( "--SPMR", dest = "do_SPMR", action = "store_true", default = False, - help = "If True, MACS will SAVE signal per million reads for fragment pileup profiles. It won't interfere with computing pvalue/qvalue during peak calling, since internally MACS3 keeps using the raw pileup and scaling factors between larger and smaller dataset to calculate statistics measurements. If you plan to use the signal output in bedGraph to call peaks using bdgcmp and bdgpeakcall, you shouldn't use this option because you will end up with different results. However, this option is recommended for displaying normalized pileup tracks across many datasets. Require -B to be set. Default: False" ) + group_output = argparser_callpeak.add_argument_group("Output arguments") + add_outdir_option(group_output) + group_output.add_argument("-n", "--name", dest="name", type=str, + help="Experiment name, which will be used to generate output file names. 
DEFAULT: \"NA\"", + default="NA") + group_output.add_argument("-B", "--bdg", dest="store_bdg", action="store_true", + help="Whether or not to save extended fragment pileup, and local lambda tracks (two files) at every bp into a bedGraph file. DEFAULT: False", + default=False) + group_output.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") + group_output.add_argument("--trackline", dest="trackline", action="store_true", default=False, + help="Instruct MACS to include trackline in the header of output files, including the bedGraph, narrowPeak, gappedPeak, BED format files. To include this trackline is necessary while uploading them to the UCSC genome browser. You can also mannually add these trackline to corresponding output files. For example, in order to upload narrowPeak file to UCSC browser, add this to as the first line -- `track type=narrowPeak name=\"my_peaks\" description=\"my peaks\"`. Default: Not to include trackline.") + + group_output.add_argument("--SPMR", dest="do_SPMR", action="store_true", default=False, + help="If True, MACS will SAVE signal per million reads for fragment pileup profiles. It won't interfere with computing pvalue/qvalue during peak calling, since internally MACS3 keeps using the raw pileup and scaling factors between larger and smaller dataset to calculate statistics measurements. If you plan to use the signal output in bedGraph to call peaks using bdgcmp and bdgpeakcall, you shouldn't use this option because you will end up with different results. However, this option is recommended for displaying normalized pileup tracks across many datasets. Require -B to be set. 
Default: False") # group for bimodal - group_bimodal = argparser_callpeak.add_argument_group( "Shifting model arguments" ) - group_bimodal.add_argument( "--nomodel", dest = "nomodel", action = "store_true", default = False, - help = "Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set extsize to change it. It's highly recommended that while you have many datasets to process and you plan to compare different conditions, aka differential calling, use both 'nomodel' and 'extsize' to make signal files from different datasets comparable. DEFAULT: False" ) - group_bimodal.add_argument( "--shift", dest = "shift", type = int, default = 0, - help = "(NOT the legacy --shiftsize option!) The arbitrary shift in bp. Use discretion while setting it other than default value. When NOMODEL is set, MACS will use this value to move cutting ends (5') towards 5'->3' direction then apply EXTSIZE to extend them to fragments. When this value is negative, ends will be moved toward 3'->5' direction. Recommended to keep it as default 0 for ChIP-Seq datasets, or -1 * half of EXTSIZE together with EXTSIZE option for detecting enriched cutting loci such as certain DNAseI-Seq datasets. Note, you can't set values other than 0 if format is BAMPE or BEDPE for paired-end data. DEFAULT: 0. " ) - group_bimodal.add_argument( "--extsize", dest = "extsize", type = int, default = 200, - help = "The arbitrary extension size in bp. When nomodel is true, MACS will use this value as fragment size to extend each read towards 3' end, then pile them up. It's exactly twice the number of obsolete SHIFTSIZE. In previous language, each read is moved 5'->3' direction to middle of fragment by 1/2 d, then extended to both direction with 1/2 d. This is equivalent to say each read is extended towards 5'->3' into a d size fragment. DEFAULT: 200. EXTSIZE and SHIFT can be combined when necessary. Check SHIFT option." 
) - group_bimodal.add_argument( "--bw", dest = "bw", type = int, default = 300, - help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. Tweaking this is not recommended. DEFAULT: 300") - group_bimodal.add_argument( "--d-min", dest = "d_min", type = int, default = 20, - help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20") - group_bimodal.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2, - help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". This setting is only used while building the shifting model. Tweaking it is not recommended. DEFAULT:5 50" ) - - group_bimodal.add_argument( "--fix-bimodal", dest = "onauto", action = "store_true", - help = "Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the --exsize parameter to extend each tags towards 3' direction. Not to use this automate fixation is a default behavior now. DEFAULT: False", - default = False ) + group_bimodal = argparser_callpeak.add_argument_group("Shifting model arguments") + group_bimodal.add_argument("--nomodel", dest="nomodel", action="store_true", default=False, + help="Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set extsize to change it. It's highly recommended that while you have many datasets to process and you plan to compare different conditions, aka differential calling, use both 'nomodel' and 'extsize' to make signal files from different datasets comparable. DEFAULT: False") + group_bimodal.add_argument("--shift", dest="shift", type=int, default=0, + help="(NOT the legacy --shiftsize option!) 
The arbitrary shift in bp. Use discretion while setting it other than default value. When NOMODEL is set, MACS will use this value to move cutting ends (5') towards 5'->3' direction then apply EXTSIZE to extend them to fragments. When this value is negative, ends will be moved toward 3'->5' direction. Recommended to keep it as default 0 for ChIP-Seq datasets, or -1 * half of EXTSIZE together with EXTSIZE option for detecting enriched cutting loci such as certain DNAseI-Seq datasets. Note, you can't set values other than 0 if format is BAMPE or BEDPE for paired-end data. DEFAULT: 0. ") + group_bimodal.add_argument("--extsize", dest="extsize", type=int, default=200, + help="The arbitrary extension size in bp. When nomodel is true, MACS will use this value as fragment size to extend each read towards 3' end, then pile them up. It's exactly twice the number of obsolete SHIFTSIZE. In previous language, each read is moved 5'->3' direction to middle of fragment by 1/2 d, then extended to both direction with 1/2 d. This is equivalent to say each read is extended towards 5'->3' into a d size fragment. DEFAULT: 200. EXTSIZE and SHIFT can be combined when necessary. Check SHIFT option.") + group_bimodal.add_argument("--bw", dest="bw", type=int, default=300, + help="Band width for picking regions to compute fragment size. This value is only used while building the shifting model. Tweaking this is not recommended. DEFAULT: 300") + group_bimodal.add_argument("--d-min", dest="d_min", type=int, default=20, + help="Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20") + group_bimodal.add_argument("-m", "--mfold", dest="mfold", type=int, default=[5, 50], nargs=2, + help="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". 
This setting is only used while building the shifting model. Tweaking it is not recommended. DEFAULT:5 50") + + group_bimodal.add_argument("--fix-bimodal", dest="onauto", action="store_true", + help="Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the --exsize parameter to extend each tags towards 3' direction. Not to use this automate fixation is a default behavior now. DEFAULT: False", + default=False) # General options. - group_callpeak = argparser_callpeak.add_argument_group( "Peak calling arguments" ) + group_callpeak = argparser_callpeak.add_argument_group("Peak calling arguments") p_or_q_group = group_callpeak.add_mutually_exclusive_group() - p_or_q_group.add_argument( "-q", "--qvalue", dest = "qvalue", type = float, default = 0.05, - help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. -q, and -p are mutually exclusive." ) - p_or_q_group.add_argument( "-p", "--pvalue", dest = "pvalue", type = float, - help = "Pvalue cutoff for peak detection. DEFAULT: not set. -q, and -p are mutually exclusive. If pvalue cutoff is set, qvalue will not be calculated and reported as -1 in the final .xls file." ) + p_or_q_group.add_argument("-q", "--qvalue", dest="qvalue", type=float, default=0.05, + help="Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. -q, and -p are mutually exclusive.") + p_or_q_group.add_argument("-p", "--pvalue", dest="pvalue", type=float, + help="Pvalue cutoff for peak detection. DEFAULT: not set. -q, and -p are mutually exclusive. If pvalue cutoff is set, qvalue will not be calculated and reported as -1 in the final .xls file.") # about scaling - group_callpeak.add_argument( "--scale-to", dest = "scaleto", type = str, choices = ("large", "small"), - help = "When set to 'small', scale the larger sample up to the smaller sample. When set to 'larger', scale the smaller sample up to the bigger sample. By default, scale to 'small'. 
This option replaces the obsolete '--to-large' option. The default behavior is recommended since it will lead to less significant p/q-values in general but more specific results. Keep in mind that scaling down will influence control/input sample more. DEFAULT: 'small', the choice is either 'small' or 'large'." ) - group_callpeak.add_argument( "--down-sample", dest = "downsample", action = "store_true", default = False, - help = "When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. If used together with --SPMR, 1 million unique reads will be randomly picked. Caution: due to the implementation, the final number of selected reads may not be as you expected! DEFAULT: False" ) - group_callpeak.add_argument( "--seed", dest = "seed", type = int, default = -1, - help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" ) - group_callpeak.add_argument( "--tempdir", dest="tempdir", default=tempfile.gettempdir(), - help = "Optional directory to store temp files. DEFAULT: %(default)s") - group_callpeak.add_argument( "--nolambda", dest = "nolambda", action = "store_true", - help = "If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to the potential chromatin accessibility. ", - default = False ) - group_callpeak.add_argument( "--slocal", dest = "smalllocal", type = int, default = 1000, - help = "The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. 
*Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 1000 " ) - group_callpeak.add_argument( "--llocal", dest = "largelocal", type = int, default = 10000, - help = "The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 10000." ) - group_callpeak.add_argument( "--max-gap", dest = "maxgap", type = int, - help = "Maximum gap between significant sites to cluster them together. The DEFAULT value is the detected read length/tag size." ) - group_callpeak.add_argument( "--min-length", dest = "minlen", type = int, - help = "Minimum length of a peak. The DEFAULT value is the predicted fragment size d. Note, if you set a value smaller than the fragment size, it may have NO effect on the result. For BROAD peak calling, try to set a large value such as 500bps. You can also use '--cutoff-analysis' option with default setting, and check the column 'avelpeak' under different cutoff values to decide a reasonable minlen value." ) - group_callpeak.add_argument( "--broad", dest = "broad", action = "store_true", - help = "If set, MACS will try to call broad peaks using the --broad-cutoff setting. Please tweak '--broad-cutoff' setting to control the peak calling behavior. At the meantime, either -q or -p cutoff will be used to define regions with 'stronger enrichment' inside of broad peaks. 
The maximum gap is expanded to 4 * MAXGAP (--max-gap parameter). As a result, MACS will output a 'gappedPeak' and a 'broadPeak' file instead of 'narrowPeak' file. Note, a broad peak will be reported even if there is no 'stronger enrichment' inside. DEFAULT: False", default = False ) - group_callpeak.add_argument( "--broad-cutoff", dest = "broadcutoff", type = float, default = 0.1, - help = "Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. Please note that in broad peakcalling mode, MACS3 uses this setting to control the overall peak calling behavior, then uses -q or -p setting to define regions inside broad region as 'stronger' enrichment. DEFAULT: 0.1 " ) - group_callpeak.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true", - help = "While set, MACS3 will analyze number or total length of peaks that can be called by different p-value cutoff then output a summary table to help user decide a better cutoff. The table will be saved in NAME_cutoff_analysis.txt file. Note, minlen and maxgap may affect the results. WARNING: May take ~30 folds longer time to finish. The result can be useful for users to decide a reasonable cutoff value. DEFAULT: False", default = False ) + group_callpeak.add_argument("--scale-to", dest="scaleto", type=str, choices=("large", "small"), + help="When set to 'small', scale the larger sample up to the smaller sample. When set to 'larger', scale the smaller sample up to the bigger sample. By default, scale to 'small'. This option replaces the obsolete '--to-large' option. The default behavior is recommended since it will lead to less significant p/q-values in general but more specific results. Keep in mind that scaling down will influence control/input sample more. 
DEFAULT: 'small', the choice is either 'small' or 'large'.") + group_callpeak.add_argument("--down-sample", dest="downsample", action="store_true", default=False, + help="When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. If used together with --SPMR, 1 million unique reads will be randomly picked. Caution: due to the implementation, the final number of selected reads may not be as you expected! DEFAULT: False") + group_callpeak.add_argument("--seed", dest="seed", type=int, default=-1, + help="Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set") + group_callpeak.add_argument("--tempdir", dest="tempdir", default=tempfile.gettempdir(), + help="Optional directory to store temp files. DEFAULT: %(default)s") + group_callpeak.add_argument("--nolambda", dest="nolambda", action="store_true", + help="If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to the potential chromatin accessibility. ", + default=False) + group_callpeak.add_argument("--slocal", dest="smalllocal", type=int, default=1000, + help="The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. 
DEFAULT: 1000 ") + group_callpeak.add_argument("--llocal", dest="largelocal", type=int, default=10000, + help="The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 10000.") + group_callpeak.add_argument("--max-gap", dest="maxgap", type=int, + help="Maximum gap between significant sites to cluster them together. The DEFAULT value is the detected read length/tag size.") + group_callpeak.add_argument("--min-length", dest="minlen", type=int, + help="Minimum length of a peak. The DEFAULT value is the predicted fragment size d. Note, if you set a value smaller than the fragment size, it may have NO effect on the result. For BROAD peak calling, try to set a large value such as 500bps. You can also use '--cutoff-analysis' option with default setting, and check the column 'avelpeak' under different cutoff values to decide a reasonable minlen value.") + group_callpeak.add_argument("--broad", dest="broad", action="store_true", + help="If set, MACS will try to call broad peaks using the --broad-cutoff setting. Please tweak '--broad-cutoff' setting to control the peak calling behavior. At the meantime, either -q or -p cutoff will be used to define regions with 'stronger enrichment' inside of broad peaks. The maximum gap is expanded to 4 * MAXGAP (--max-gap parameter). As a result, MACS will output a 'gappedPeak' and a 'broadPeak' file instead of 'narrowPeak' file. Note, a broad peak will be reported even if there is no 'stronger enrichment' inside. 
DEFAULT: False", default=False) + group_callpeak.add_argument("--broad-cutoff", dest="broadcutoff", type=float, default=0.1, + help="Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. Please note that in broad peakcalling mode, MACS3 uses this setting to control the overall peak calling behavior, then uses -q or -p setting to define regions inside broad region as 'stronger' enrichment. DEFAULT: 0.1 ") + group_callpeak.add_argument("--cutoff-analysis", dest="cutoff_analysis", action="store_true", + help="While set, MACS3 will analyze number or total length of peaks that can be called by different p-value cutoff then output a summary table to help user decide a better cutoff. The table will be saved in NAME_cutoff_analysis.txt file. Note, minlen and maxgap may affect the results. WARNING: May take ~30 folds longer time to finish. The result can be useful for users to decide a reasonable cutoff value. DEFAULT: False", default=False) # post-processing options - group_postprocessing = argparser_callpeak.add_argument_group( "Post-processing options" ) + group_postprocessing = argparser_callpeak.add_argument_group("Post-processing options") postprocess_group = group_postprocessing.add_mutually_exclusive_group() - postprocess_group.add_argument( "--call-summits", dest="call_summits", action="store_true", - help="If set, MACS will use a more sophisticated signal processing approach to find subpeak summits in each enriched peak region. DEFAULT: False",default=False) - group_postprocessing.add_argument( "--fe-cutoff", dest="fecutoff", type=float, default = 1.0, - help = "When set, the value will be used as the minimum requirement to filter out peaks with low fold-enrichment. Note, MACS3 adds one as pseudocount while calculating fold-enrichment. By default, it is set as 1 so there is no filtering. 
DEFAULT: 1.0") + postprocess_group.add_argument("--call-summits", dest="call_summits", action="store_true", + help="If set, MACS will use a more sophisticated signal processing approach to find subpeak summits in each enriched peak region. DEFAULT: False", default=False) + group_postprocessing.add_argument("--fe-cutoff", dest="fecutoff", type=float, default=1.0, + help="When set, the value will be used as the minimum requirement to filter out peaks with low fold-enrichment. Note, MACS3 adds one as pseudocount while calculating fold-enrichment. By default, it is set as 1 so there is no filtering. DEFAULT: 1.0") # obsolete options - group_obsolete = argparser_callpeak.add_argument_group( "Obsolete options" ) - group_obsolete.add_argument( "--to-large", dest = "tolarge", action = "store_true", default = False, - help = "Obsolete option. Please use '--scale-to large' instead." ) - group_obsolete.add_argument( "--ratio", dest = "ratio", type = float, default = 1.0, - help = "Obsolete option. Originally designed to normalize treatment and control with customized ratio, now it won't have any effect." ) + group_obsolete = argparser_callpeak.add_argument_group("Obsolete options") + group_obsolete.add_argument("--to-large", dest="tolarge", action="store_true", default=False, + help="Obsolete option. Please use '--scale-to large' instead.") + group_obsolete.add_argument("--ratio", dest="ratio", type=float, default=1.0, + help="Obsolete option. Originally designed to normalize treatment and control with customized ratio, now it won't have any effect.") # other options - group_other = argparser_callpeak.add_argument_group( "Other options" ) - group_other.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000", - help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. 
However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " ) + group_other = argparser_callpeak.add_argument_group("Other options") + group_other.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000", + help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ") return -def add_filterdup_parser( subparsers ): - argparser_filterdup = subparsers.add_parser( "filterdup", - help = "Remove duplicate reads, then save in BED/BEDPE format file." ) - argparser_filterdup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+", - help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED." ) - argparser_filterdup.add_argument( "-f", "--format", dest = "format", type = str, - choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"), - help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. 
DEFAULT: \"AUTO\"", - default = "AUTO" ) - argparser_filterdup.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs", - help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.") - argparser_filterdup.add_argument( "-s", "--tsize", dest = "tsize", type = int, - help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set" ) - argparser_filterdup.add_argument( "-p", "--pvalue", dest = "pvalue", type = float, - help = "Pvalue cutoff for binomial distribution test. DEFAULT:1e-5" ) - argparser_filterdup.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "auto", - help = "It controls the '%(prog)s' behavior towards duplicate tags/pairs at the exact same location -- the same coordination and the same strand. The 'auto' option makes '%(prog)s' calculate the maximum tags at the exact same location based on binomal distribution using given -p as pvalue cutoff; and the 'all' option keeps every tags (useful if you only want to convert formats). If an integer is given, at most this number of tags will be kept at the same location. Note, MACS3 callpeak function uses KEEPDUPLICATES=1 as default. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. Default: auto" ) - argparser_filterdup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000", - help = "Buffer size for incrementally increasing internal array size to store reads alignment information. 
In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " ) - - argparser_filterdup.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" ) - add_outdir_option( argparser_filterdup ) - argparser_filterdup.add_argument( "-o", "--ofile", dest = "outputfile", type = str, - help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout", - default = "stdout" ) - argparser_filterdup.add_argument( "-d", "--dry-run", dest="dryrun", action="store_true", default=False, - help = "When set, filterdup will only output numbers instead of writing output files, including maximum allowable duplicates, total number of reads before filtering, total number of reads after filtering, and redundant rate. Default: not set" ) + +def add_filterdup_parser(subparsers): + argparser_filterdup = subparsers.add_parser("filterdup", + help="Remove duplicate reads, then save in BED/BEDPE format file.") + argparser_filterdup.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+", + help="Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. 
REQUIRED.") + argparser_filterdup.add_argument("-f", "--format", dest="format", type=str, + choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE"), + help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"", + default="AUTO") + argparser_filterdup.add_argument("-g", "--gsize", dest="gsize", type=str, default="hs", + help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.") + argparser_filterdup.add_argument("-s", "--tsize", dest="tsize", type=int, + help="Tag size. This will override the auto detected tag size. DEFAULT: Not set") + argparser_filterdup.add_argument("-p", "--pvalue", dest="pvalue", type=float, + help="Pvalue cutoff for binomial distribution test. DEFAULT:1e-5") + argparser_filterdup.add_argument("--keep-dup", dest="keepduplicates", type=str, default="auto", + help="It controls the '%(prog)s' behavior towards duplicate tags/pairs at the exact same location -- the same coordination and the same strand. The 'auto' option makes '%(prog)s' calculate the maximum tags at the exact same location based on binomal distribution using given -p as pvalue cutoff; and the 'all' option keeps every tags (useful if you only want to convert formats). 
If an integer is given, at most this number of tags will be kept at the same location. Note, MACS3 callpeak function uses KEEPDUPLICATES=1 as default. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. Default: auto") + argparser_filterdup.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000", + help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ") + argparser_filterdup.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2") + add_outdir_option(argparser_filterdup) + argparser_filterdup.add_argument("-o", "--ofile", dest="outputfile", type=str, + help="Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout", + default="stdout") + argparser_filterdup.add_argument("-d", "--dry-run", dest="dryrun", action="store_true", default=False, + help="When set, filterdup will only output numbers instead of writing output files, including maximum allowable duplicates, total number of reads before filtering, total number of reads after filtering, and redundant rate. 
Default: not set") return -def add_bdgpeakcall_parser( subparsers ): +def add_bdgpeakcall_parser(subparsers): """Add function 'peak calling on bedGraph' argument parsers. """ - argparser_bdgpeakcall = subparsers.add_parser( "bdgpeakcall", - help = "Call peaks from bedGraph file." ) - argparser_bdgpeakcall.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, - help = "MACS score in bedGraph. REQUIRED" ) - argparser_bdgpeakcall.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float, - help = "Cutoff depends on which method you used for score track. If the file contains pvalue scores from MACS3, score 5 means pvalue 1e-5. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 5", default = 5 ) - argparser_bdgpeakcall.add_argument( "-l", "--min-length", dest = "minlen", type = int, - help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 ) - argparser_bdgpeakcall.add_argument( "-g", "--max-gap", dest = "maxgap", type = int, - help = "maximum gap between significant points in a peak, better to set it as tag size. DEFAULT: 30", default = 30 ) - argparser_bdgpeakcall.add_argument( "--call-summits", dest="call_summits", action="store_true", help=ap.SUPPRESS, default=False) -# help="If set, MACS will use a more sophisticated approach to find all summits in each enriched peak region. DEFAULT: False",default=False) - argparser_bdgpeakcall.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true", - help = "While set, bdgpeakcall will analyze number or total length of peaks that can be called by different cutoff then output a summary table to help user decide a better cutoff. Note, minlen and maxgap may affect the results. DEFAULT: False", default = False ) - argparser_bdgpeakcall.add_argument( "--cutoff-analysis-max", dest="cutoff_analysis_max", type = int, - help = "The maximum cutoff score for performing cutoff analysis. 
Together with --cutoff-analysis-steps, the resolution in the final report can be controlled. Please check the description in --cutoff-analysis-steps for detail. DEFAULT: 100", - default = 100 ) - argparser_bdgpeakcall.add_argument( "--cutoff-analysis-steps", dest="cutoff_analysis_steps", type = int, - help = "Steps for performing cutoff analysis. It will be used to decide which cutoff value should be included in the final report. Larger the value, higher resolution the cutoff analysis can be. The cutoff analysis function will first find the smallest (at least 0) and the largest (controlled by --cutoff-analysis-max) score in the data, then break the range of score into `CUTOFF_ANALYSIS_STEPS` intervals. It will then use each score as cutoff to call peaks and calculate the total number of candidate peaks, the total basepairs of peaks, and the average length of peak in basepair. Please note that the final report ideally should include `CUTOFF_ANALYSIS_STEPS` rows, but in practice, if the cutoff yield zero peak, the row for that value won't be included. DEFAULT: 100", default = 100 ) + argparser_bdgpeakcall = subparsers.add_parser("bdgpeakcall", + help="Call peaks from bedGraph file.") + argparser_bdgpeakcall.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, + help="MACS score in bedGraph. REQUIRED") + argparser_bdgpeakcall.add_argument("-c", "--cutoff", dest="cutoff", type=float, + help="Cutoff depends on which method you used for score track. If the file contains pvalue scores from MACS3, score 5 means pvalue 1e-5. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 5", default=5) + argparser_bdgpeakcall.add_argument("-l", "--min-length", dest="minlen", type=int, + help="minimum length of peak, better to set it as d value. 
DEFAULT: 200", default=200) + argparser_bdgpeakcall.add_argument("-g", "--max-gap", dest="maxgap", type=int, + help="maximum gap between significant points in a peak, better to set it as tag size. DEFAULT: 30", default=30) + argparser_bdgpeakcall.add_argument("--call-summits", dest="call_summits", action="store_true", help=ap.SUPPRESS, default=False) + # help="If set, MACS will use a more sophisticated approach to find all summits in each enriched peak region. DEFAULT: False",default=False) + argparser_bdgpeakcall.add_argument("--cutoff-analysis", dest="cutoff_analysis", action="store_true", + help="While set, bdgpeakcall will analyze number or total length of peaks that can be called by different cutoff then output a summary table to help user decide a better cutoff. Note, minlen and maxgap may affect the results. DEFAULT: False", default=False) + argparser_bdgpeakcall.add_argument("--cutoff-analysis-max", dest="cutoff_analysis_max", type=int, + help="The maximum cutoff score for performing cutoff analysis. Together with --cutoff-analysis-steps, the resolution in the final report can be controlled. Please check the description in --cutoff-analysis-steps for detail. DEFAULT: 100", + default=100) + argparser_bdgpeakcall.add_argument("--cutoff-analysis-steps", dest="cutoff_analysis_steps", type=int, + help="Steps for performing cutoff analysis. It will be used to decide which cutoff value should be included in the final report. Larger the value, higher resolution the cutoff analysis can be. The cutoff analysis function will first find the smallest (at least 0) and the largest (controlled by --cutoff-analysis-max) score in the data, then break the range of score into `CUTOFF_ANALYSIS_STEPS` intervals. It will then use each score as cutoff to call peaks and calculate the total number of candidate peaks, the total basepairs of peaks, and the average length of peak in basepair. 
Please note that the final report ideally should include `CUTOFF_ANALYSIS_STEPS` rows, but in practice, if the cutoff yield zero peak, the row for that value won't be included. DEFAULT: 100", default=100) argparser_bdgpeakcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True, - help="Tells MACS not to include trackline with bedGraph files. The trackline is used by UCSC for the options for display.") - argparser_bdgpeakcall.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) + help="Tells MACS not to include trackline with bedGraph files. The trackline is used by UCSC for the options for display.") + argparser_bdgpeakcall.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") - add_outdir_option( argparser_bdgpeakcall ) - add_output_group( argparser_bdgpeakcall ) + add_outdir_option(argparser_bdgpeakcall) + add_output_group(argparser_bdgpeakcall) return -def add_bdgbroadcall_parser( subparsers ): +def add_bdgbroadcall_parser(subparsers): """Add function 'broad peak calling on bedGraph' argument parsers. """ - argparser_bdgbroadcall = subparsers.add_parser( "bdgbroadcall", - help = "Call nested broad peaks from bedGraph file." ) - argparser_bdgbroadcall.add_argument( "-i", "--ifile", dest = "ifile" , type = str, required = True, - help = "MACS score in bedGraph. REQUIRED" ) - argparser_bdgbroadcall.add_argument( "-c", "--cutoff-peak", dest = "cutoffpeak", type = float, - help = "Cutoff for peaks depending on which method you used for score track. If the file contains qvalue scores from MACS3, score 2 means qvalue 0.01. 
Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 2", - default = 2 ) - argparser_bdgbroadcall.add_argument( "-C", "--cutoff-link", dest = "cutofflink", type = float, - help = "Cutoff for linking regions/low abundance regions depending on which method you used for score track. If the file contains qvalue scores from MACS3, score 1 means qvalue 0.1, and score 0.3 means qvalue 0.5. DEFAULT: 1", default = 1 ) - argparser_bdgbroadcall.add_argument( "-l", "--min-length", dest = "minlen", type = int, - help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 ) - argparser_bdgbroadcall.add_argument( "-g", "--lvl1-max-gap", dest = "lvl1maxgap", type = int, - help = "maximum gap between significant peaks, better to set it as tag size. DEFAULT: 30", default = 30 ) - argparser_bdgbroadcall.add_argument( "-G", "--lvl2-max-gap", dest = "lvl2maxgap", type = int, - help = "maximum linking between significant peaks, better to set it as 4 times of d value. DEFAULT: 800", default = 800) + argparser_bdgbroadcall = subparsers.add_parser("bdgbroadcall", + help="Call nested broad peaks from bedGraph file.") + argparser_bdgbroadcall.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, + help="MACS score in bedGraph. REQUIRED") + argparser_bdgbroadcall.add_argument("-c", "--cutoff-peak", dest="cutoffpeak", type=float, + help="Cutoff for peaks depending on which method you used for score track. If the file contains qvalue scores from MACS3, score 2 means qvalue 0.01. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 2", + default=2) + argparser_bdgbroadcall.add_argument("-C", "--cutoff-link", dest="cutofflink", type=float, + help="Cutoff for linking regions/low abundance regions depending on which method you used for score track. If the file contains qvalue scores from MACS3, score 1 means qvalue 0.1, and score 0.3 means qvalue 0.5. 
DEFAULT: 1", default=1) + argparser_bdgbroadcall.add_argument("-l", "--min-length", dest="minlen", type=int, + help="minimum length of peak, better to set it as d value. DEFAULT: 200", default=200) + argparser_bdgbroadcall.add_argument("-g", "--lvl1-max-gap", dest="lvl1maxgap", type=int, + help="maximum gap between significant peaks, better to set it as tag size. DEFAULT: 30", default=30) + argparser_bdgbroadcall.add_argument("-G", "--lvl2-max-gap", dest="lvl2maxgap", type=int, + help="maximum linking between significant peaks, better to set it as 4 times of d value. DEFAULT: 800", default=800) argparser_bdgbroadcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True, - help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.") - argparser_bdgbroadcall.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) - add_outdir_option( argparser_bdgbroadcall ) - add_output_group( argparser_bdgbroadcall ) + help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.") + argparser_bdgbroadcall.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") + add_outdir_option(argparser_bdgbroadcall) + add_output_group(argparser_bdgbroadcall) return -def add_bdgcmp_parser( subparsers ): +def add_bdgcmp_parser(subparsers): """Add function 'peak calling on bedGraph' argument parsers. """ - argparser_bdgcmp = subparsers.add_parser( "bdgcmp", - help = "Comparing two signal tracks in bedGraph format." 
) - argparser_bdgcmp.add_argument( "-t", "--tfile", dest = "tfile", type = str, required = True, - help = "Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACSv2. REQUIRED") - argparser_bdgcmp.add_argument( "-c", "--cfile", dest = "cfile", type = str, required = True, - help = "Control bedGraph file, e.g. *_control_lambda.bdg from MACSv2. REQUIRED") - argparser_bdgcmp.add_argument( "-S", "--scaling-factor", dest = "sfactor", type = float, default = 1.0, - help = "Scaling factor for treatment and control track. Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS3 callpeak, and plan to calculate scores as MACS3 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redudant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in larger sample. DEFAULT: 1.0") - argparser_bdgcmp.add_argument( "-p", "--pseudocount", dest = "pseudocount", type = float, default = 0.0, - help = "The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.") - - argparser_bdgcmp.add_argument( "-m", "--method", dest = "method", type = str, nargs = "+", - choices = ( "ppois", "qpois", "subtract", "logFE", "FE", "logLR", "slogLR", "max" ), - help = "Method to use while calculating a score in any bin by comparing treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, FE, logLR, slogLR, and max. 
They represent Poisson Pvalue (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for poisson pvalues, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment(need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model(need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or maximum value between the two tracks. Default option is ppois.",default="ppois") - argparser_bdgcmp.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) - - add_outdir_option( argparser_bdgcmp ) - output_group = argparser_bdgcmp.add_mutually_exclusive_group( required = True ) - output_group.add_argument( "--o-prefix", dest = "oprefix", type = str, - help = "The PREFIX of output bedGraph file to write scores. If it is given as A, and method is 'ppois', output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile." ) - output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = "+", - help = "Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m." ) + argparser_bdgcmp = subparsers.add_parser("bdgcmp", + help="Comparing two signal tracks in bedGraph format.") + argparser_bdgcmp.add_argument("-t", "--tfile", dest="tfile", type=str, required=True, + help="Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACSv2. REQUIRED") + argparser_bdgcmp.add_argument("-c", "--cfile", dest="cfile", type=str, required=True, + help="Control bedGraph file, e.g. *_control_lambda.bdg from MACSv2. REQUIRED") + argparser_bdgcmp.add_argument("-S", "--scaling-factor", dest="sfactor", type=float, default=1.0, + help="Scaling factor for treatment and control track. 
Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS3 callpeak, and plan to calculate scores as MACS3 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redudant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in larger sample. DEFAULT: 1.0") + argparser_bdgcmp.add_argument("-p", "--pseudocount", dest="pseudocount", type=float, default=0.0, + help="The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.") + + argparser_bdgcmp.add_argument("-m", "--method", dest="method", type=str, nargs="+", + choices=("ppois", "qpois", "subtract", "logFE", "FE", "logLR", "slogLR", "max"), + help="Method to use while calculating a score in any bin by comparing treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, FE, logLR, slogLR, and max. They represent Poisson Pvalue (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for poisson pvalues, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment(need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model(need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or maximum value between the two tracks. Default option is ppois.",default="ppois") + argparser_bdgcmp.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. 
DEFAULT:2") + + add_outdir_option(argparser_bdgcmp) + output_group = argparser_bdgcmp.add_mutually_exclusive_group(required=True) + output_group.add_argument("--o-prefix", dest="oprefix", type=str, + help="The PREFIX of output bedGraph file to write scores. If it is given as A, and method is 'ppois', output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile.") + output_group.add_argument("-o", "--ofile", dest="ofile", type=str, nargs="+", + help="Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m.") return -def add_bdgopt_parser( subparsers ): +def add_bdgopt_parser(subparsers): """Add function 'operations on score column of bedGraph' argument parsers. """ - argparser_bdgopt = subparsers.add_parser( "bdgopt", - help = "Operate the score column of bedGraph file." ) - argparser_bdgopt.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, - help = "MACS score in bedGraph. Note: this must be a bedGraph file covering the ENTIRE genome. REQUIRED" ) - argparser_bdgopt.add_argument( "-m", "--method", dest = "method", type = str, - choices = ( "multiply", "add", "p2q", "max", "min" ), - help = "Method to modify the score column of bedGraph file. Available choices are: multiply, add, max, min, or p2q. 1) multiply, the EXTRAPARAM is required and will be multiplied to the score column. If you intend to divide the score column by X, use value of 1/X as EXTRAPARAM. 2) add, the EXTRAPARAM is required and will be added to the score column. If you intend to subtract the score column by X, use value of -X as EXTRAPARAM. 3) max, the EXTRAPARAM is required and will take the maximum value between score and the EXTRAPARAM. 4) min, the EXTRAPARAM is required and will take the minimum value between score and the EXTRAPARAM. 5) p2q, this will convert p-value scores to q-value scores using Benjamini-Hochberg process. The EXTRAPARAM is not required. 
This method assumes the scores are -log10 p-value from MACS3. Any other types of score will cause unexpected errors.", default="p2q") - argparser_bdgopt.add_argument( "-p", "--extra-param", dest = "extraparam", type = float, nargs = "*", - help = "The extra parameter for METHOD. Check the detail of -m option.") - add_outdir_option( argparser_bdgopt ) - argparser_bdgopt.add_argument( "-o", "--ofile", dest = "ofile", type = str, - help = "Output BEDGraph filename.", required = True ) - argparser_bdgopt.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) + argparser_bdgopt = subparsers.add_parser("bdgopt", + help="Operate the score column of bedGraph file.") + argparser_bdgopt.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, + help="MACS score in bedGraph. Note: this must be a bedGraph file covering the ENTIRE genome. REQUIRED") + argparser_bdgopt.add_argument("-m", "--method", dest="method", type=str, + choices=("multiply", "add", "p2q", "max", "min"), + help="Method to modify the score column of bedGraph file. Available choices are: multiply, add, max, min, or p2q. 1) multiply, the EXTRAPARAM is required and will be multiplied to the score column. If you intend to divide the score column by X, use value of 1/X as EXTRAPARAM. 2) add, the EXTRAPARAM is required and will be added to the score column. If you intend to subtract the score column by X, use value of -X as EXTRAPARAM. 3) max, the EXTRAPARAM is required and will take the maximum value between score and the EXTRAPARAM. 4) min, the EXTRAPARAM is required and will take the minimum value between score and the EXTRAPARAM. 5) p2q, this will convert p-value scores to q-value scores using Benjamini-Hochberg process. The EXTRAPARAM is not required. 
This method assumes the scores are -log10 p-value from MACS3. Any other types of score will cause unexpected errors.", default="p2q") + argparser_bdgopt.add_argument("-p", "--extra-param", dest="extraparam", type=float, nargs="*", + help="The extra parameter for METHOD. Check the detail of -m option.") + add_outdir_option(argparser_bdgopt) + argparser_bdgopt.add_argument("-o", "--ofile", dest="ofile", type=str, + help="Output BEDGraph filename.", required=True) + argparser_bdgopt.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") return -def add_cmbreps_parser( subparsers ): + +def add_cmbreps_parser(subparsers): """Add function 'combine replicates' argument parsers. """ - argparser_cmbreps = subparsers.add_parser( "cmbreps", - help = "Combine bedGraph files of scores from replicates." ) - argparser_cmbreps.add_argument( "-i", dest = "ifile", type = str, required = True, nargs = "+", - help = "MACS score in bedGraph for each replicate. Require at least 2 files such as '-i A B C D'. REQUIRED" ) - # argparser_cmbreps.add_argument( "-w", dest = "weights", type = float, nargs = "*", - # help = "Weight for each replicate. Default is 1.0 for each. When given, require same number of parameters as IFILE." ) - argparser_cmbreps.add_argument( "-m", "--method", dest = "method", type = str, - choices = ( "fisher", "max", "mean" ), - help = "Method to use while combining scores from replicates. 1) fisher: Fisher's combined probability test. It requires scores in ppois form (-log10 pvalues) from bdgcmp. Other types of scores for this method may cause cmbreps unexpected errors. 2) max: take the maximum value from replicates for each genomic position. 3) mean: take the average value. 
Note, except for Fisher's method, max or mean will take scores AS IS which means they won't convert scores from log scale to linear scale or vice versa.", default="fisher") - add_outdir_option( argparser_cmbreps ) - argparser_cmbreps.add_argument( "-o", "--ofile", dest = "ofile", type = str, required = True, - help = "Output BEDGraph filename for combined scores." ) - argparser_cmbreps.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) + argparser_cmbreps = subparsers.add_parser("cmbreps", + help="Combine bedGraph files of scores from replicates.") + argparser_cmbreps.add_argument("-i", dest="ifile", type=str, required=True, nargs="+", + help="MACS score in bedGraph for each replicate. Require at least 2 files such as '-i A B C D'. REQUIRED") + # argparser_cmbreps.add_argument("-w", dest="weights", type=float, nargs="*", + # help="Weight for each replicate. Default is 1.0 for each. When given, require same number of parameters as IFILE.") + argparser_cmbreps.add_argument("-m", "--method", dest="method", type=str, + choices=("fisher", "max", "mean"), + help="Method to use while combining scores from replicates. 1) fisher: Fisher's combined probability test. It requires scores in ppois form (-log10 pvalues) from bdgcmp. Other types of scores for this method may cause cmbreps unexpected errors. 2) max: take the maximum value from replicates for each genomic position. 3) mean: take the average value. 
Note, except for Fisher's method, max or mean will take scores AS IS which means they won't convert scores from log scale to linear scale or vice versa.", default="fisher") + add_outdir_option(argparser_cmbreps) + argparser_cmbreps.add_argument("-o", "--ofile", dest="ofile", type=str, required=True, + help="Output BEDGraph filename for combined scores.") + argparser_cmbreps.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") return -def add_randsample_parser( subparsers ): - argparser_randsample = subparsers.add_parser( "randsample", - help = "Randomly choose a number/percentage of total reads, then save in BED/BEDPE format file." ) - argparser_randsample.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+", - help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED." ) - - p_or_n_group = argparser_randsample.add_mutually_exclusive_group( required = True ) - p_or_n_group.add_argument( "-p", "--percentage", dest = "percentage", type = float, - help = "Percentage of tags you want to keep. Input 80.0 for 80%%. This option can't be used at the same time with -n/--num. If the setting is 100, it will keep all the reads and convert any format that MACS3 supports into BED or BEDPE (if input is BAMPE) format. REQUIRED") - p_or_n_group.add_argument( "-n", "--number", dest = "number", type = float, - help = "Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED" ) - argparser_randsample.add_argument( "--seed", dest = "seed", type = int, default = -1, - help = "Set the random seed while down sampling data. 
Must be a non-negative integer in order to be effective. If you want more reproducible results, please specify a random seed and record it.DEFAULT: not set" ) - argparser_randsample.add_argument( "-o", "--ofile", dest = "outputfile", type = str, - help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout", - default = None) - add_outdir_option( argparser_randsample ) - argparser_randsample.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None, - help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set") - argparser_randsample.add_argument( "-f", "--format", dest = "format", type = str, - choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"), - help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will %(prog)s decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"", - default = "AUTO" ) - argparser_randsample.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000", - help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " ) - - argparser_randsample.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level. 
0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" ) + +def add_randsample_parser(subparsers): + argparser_randsample = subparsers.add_parser("randsample", + help="Randomly choose a number/percentage of total reads, then save in BED/BEDPE format file.") + argparser_randsample.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+", + help="Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED.") + + p_or_n_group = argparser_randsample.add_mutually_exclusive_group(required=True) + p_or_n_group.add_argument("-p", "--percentage", dest="percentage", type=float, + help="Percentage of tags you want to keep. Input 80.0 for 80%%. This option can't be used at the same time with -n/--num. If the setting is 100, it will keep all the reads and convert any format that MACS3 supports into BED or BEDPE (if input is BAMPE) format. REQUIRED") + p_or_n_group.add_argument("-n", "--number", dest="number", type=float, + help="Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED") + argparser_randsample.add_argument("--seed", dest="seed", type=int, default=-1, + help="Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. If you want more reproducible results, please specify a random seed and record it.DEFAULT: not set") + argparser_randsample.add_argument("-o", "--ofile", dest="outputfile", type=str, + help="Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. 
DEFAULT: stdout", + default=None) + add_outdir_option(argparser_randsample) + argparser_randsample.add_argument("-s", "--tsize", dest="tsize", type=int, default=None, + help="Tag size. This will override the auto detected tag size. DEFAULT: Not set") + argparser_randsample.add_argument("-f", "--format", dest="format", type=str, + choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE"), + help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will %(prog)s decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"", + default="AUTO") + argparser_randsample.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000", + help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ") + + argparser_randsample.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2") return -def add_bdgdiff_parser( subparsers ): - argparser_bdgdiff = subparsers.add_parser( "bdgdiff", - help = "Differential peak detection based on paired four bedGraph files." 
) - argparser_bdgdiff.add_argument( "--t1", dest = "t1bdg", type = str, required = True, - help = "MACS pileup bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" ) - argparser_bdgdiff.add_argument( "--t2", dest="t2bdg", type = str, required = True, - help = "MACS pileup bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" ) - argparser_bdgdiff.add_argument( "--c1", dest = "c1bdg", type = str, required = True, - help = "MACS control lambda bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" ) - argparser_bdgdiff.add_argument( "--c2", dest="c2bdg", type = str, required = True, - help = "MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" ) - argparser_bdgdiff.add_argument( "-C", "--cutoff", dest = "cutoff", type = float, - help = "log10LR cutoff. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 3 (likelihood ratio=1000)", default = 3 ) - argparser_bdgdiff.add_argument( "-l", "--min-len", dest = "minlen", type = int, - help = "Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default = 200 ) - argparser_bdgdiff.add_argument( "-g", "--max-gap", dest = "maxgap", type = int, - help = "Maximum gap to merge nearby differential regions. Consider a wider gap for broad marks. Maximum gap should be smaller than minimum length (-g). DEFAULT: 100", default = 100 ) - argparser_bdgdiff.add_argument( "--d1", "--depth1", dest = "depth1", type = float, default = 1.0, - help = "Sequencing depth (# of non-redundant reads in million) for condition 1. It will be used together with --d2. See description for --d2 below for how to assign them. Default: 1" ) - argparser_bdgdiff.add_argument( "--d2", "--depth2", dest = "depth2", type = float, default = 1.0, - help = "Sequencing depth (# of non-redundant reads in million) for condition 2. It will be used together with --d1. 
DEPTH1 and DEPTH2 will be used to calculate scaling factor for each sample, to down-scale larger sample to the level of smaller one. For example, while comparing 10 million condition 1 and 20 million condition 2, use --d1 10 --d2 20, then pileup value in bedGraph for condition 2 will be divided by 2. Default: 1" ) - argparser_bdgdiff.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) - - add_outdir_option( argparser_bdgdiff ) - output_group = argparser_bdgdiff.add_mutually_exclusive_group( required = True ) - output_group.add_argument( "--o-prefix", dest = "oprefix", type = str, - help = "Output file prefix. Actual files will be named as PREFIX_cond1.bed, PREFIX_cond2.bed and PREFIX_common.bed. Mutually exclusive with -o/--ofile." ) - output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = 3, - help = "Output filenames. Must give three arguments in order: 1. file for unique regions in condition 1; 2. file for unique regions in condition 2; 3. file for common regions in both conditions. Note: mutually exclusive with --o-prefix." ) + +def add_bdgdiff_parser(subparsers): + argparser_bdgdiff = subparsers.add_parser("bdgdiff", + help="Differential peak detection based on paired four bedGraph files.") + argparser_bdgdiff.add_argument("--t1", dest="t1bdg", type=str, required=True, + help="MACS pileup bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED") + argparser_bdgdiff.add_argument("--t2", dest="t2bdg", type=str, required=True, + help="MACS pileup bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED") + argparser_bdgdiff.add_argument("--c1", dest="c1bdg", type=str, required=True, + help="MACS control lambda bedGraph for condition 1. Incompatible with callpeak --SPMR output. 
REQUIRED") + argparser_bdgdiff.add_argument("--c2", dest="c2bdg", type=str, required=True, + help="MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED") + argparser_bdgdiff.add_argument("-C", "--cutoff", dest="cutoff", type=float, + help="log10LR cutoff. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 3 (likelihood ratio=1000)", default=3) + argparser_bdgdiff.add_argument("-l", "--min-len", dest="minlen", type=int, + help="Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default=200) + argparser_bdgdiff.add_argument("-g", "--max-gap", dest="maxgap", type=int, + help="Maximum gap to merge nearby differential regions. Consider a wider gap for broad marks. Maximum gap should be smaller than minimum length (-g). DEFAULT: 100", default=100) + argparser_bdgdiff.add_argument("--d1", "--depth1", dest="depth1", type=float, default=1.0, + help="Sequencing depth (# of non-redundant reads in million) for condition 1. It will be used together with --d2. See description for --d2 below for how to assign them. Default: 1") + argparser_bdgdiff.add_argument("--d2", "--depth2", dest="depth2", type=float, default=1.0, + help="Sequencing depth (# of non-redundant reads in million) for condition 2. It will be used together with --d1. DEPTH1 and DEPTH2 will be used to calculate scaling factor for each sample, to down-scale larger sample to the level of smaller one. For example, while comparing 10 million condition 1 and 20 million condition 2, use --d1 10 --d2 20, then pileup value in bedGraph for condition 2 will be divided by 2. Default: 1") + argparser_bdgdiff.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. 
DEFAULT:2") + + add_outdir_option(argparser_bdgdiff) + output_group = argparser_bdgdiff.add_mutually_exclusive_group(required=True) + output_group.add_argument("--o-prefix", dest="oprefix", type=str, + help="Output file prefix. Actual files will be named as PREFIX_cond1.bed, PREFIX_cond2.bed and PREFIX_common.bed. Mutually exclusive with -o/--ofile.") + output_group.add_argument("-o", "--ofile", dest="ofile", type=str, nargs=3, + help="Output filenames. Must give three arguments in order: 1. file for unique regions in condition 1; 2. file for unique regions in condition 2; 3. file for common regions in both conditions. Note: mutually exclusive with --o-prefix.") return -def add_refinepeak_parser( subparsers ): - argparser_refinepeak = subparsers.add_parser( "refinepeak", - help = "Take raw reads alignment, refine peak summits. Inspired by SPP." ) - argparser_refinepeak.add_argument( "-b", dest = "bedfile", type = str, required = True, - help = "Candidate peak file in BED format. REQUIRED." ) - argparser_refinepeak.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+", - help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." ) - argparser_refinepeak.add_argument( "-f", "--format", dest = "format", type = str, - choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE"), - help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"", - default = "AUTO" ) - argparser_refinepeak.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float, - help = "Cutoff. 
Regions with SPP wtd score lower than cutoff will not be considerred. DEFAULT: 5", default = 5 ) - argparser_refinepeak.add_argument( "-w", "--window-size", dest= "windowsize", help = 'Scan window size on both side of the summit (default: 100bp)', - type = int, default = 200) - argparser_refinepeak.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000", - help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " ) - - argparser_refinepeak.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" ) - - add_outdir_option( argparser_refinepeak ) - add_output_group( argparser_refinepeak ) +def add_refinepeak_parser(subparsers): + argparser_refinepeak = subparsers.add_parser("refinepeak", + help="Take raw reads alignment, refine peak summits. Inspired by SPP.") + argparser_refinepeak.add_argument("-b", dest="bedfile", type=str, required=True, + help="Candidate peak file in BED format. REQUIRED.") + argparser_refinepeak.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+", + help="ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. 
REQUIRED.") + argparser_refinepeak.add_argument("-f", "--format", dest="format", type=str, + choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE"), + help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"", + default="AUTO") + argparser_refinepeak.add_argument("-c", "--cutoff", dest="cutoff", type=float, + help="Cutoff. Regions with SPP wtd score lower than cutoff will not be considerred. DEFAULT: 5", default=5) + argparser_refinepeak.add_argument("-w", "--window-size", dest="windowsize", help='Scan window size on both side of the summit (default: 100bp)', + type=int, default=200) + argparser_refinepeak.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000", + help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ") + + argparser_refinepeak.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. 
DEFAULT:2") + + add_outdir_option(argparser_refinepeak) + add_output_group(argparser_refinepeak) return -def add_predictd_parser( subparsers ): + +def add_predictd_parser(subparsers): """Add main function 'predictd' argument parsers. """ argparser_predictd = subparsers.add_parser("predictd", help="Predict d or fragment size from alignment results. In case of PE data, report the average insertion/fragment size from all pairs.") # group for input files - argparser_predictd.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+", - help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED." ) - argparser_predictd.add_argument( "-f", "--format", dest = "format", type = str, - choices = ("AUTO", "BAM", "SAM", "BED", "ELAND", - "ELANDMULTI", "ELANDEXPORT", "BOWTIE", - "BAMPE", "BEDPE"), - help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. However, if you want to decide the average insertion size/fragment size from PE data such as BEDPE or BAMPE, please specify the format as BAMPE or BEDPE since MACS3 won't automatically recognize three two formats with -f AUTO. Please be aware that in PE mode, -g, -s, --bw, --d-min, -m, and --rfile have NO effect. DEFAULT: \"AUTO\"", - default = "AUTO" ) - argparser_predictd.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs", - help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. 
The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.") - argparser_predictd.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None, - help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set") - argparser_predictd.add_argument( "--bw", dest = "bw", type = int, default = 300, - help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. DEFAULT: 300") - argparser_predictd.add_argument( "--d-min", dest = "d_min", type = int, default = 20, - help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20") - argparser_predictd.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2, - help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". DEFAULT:5 50" ) - - add_outdir_option( argparser_predictd ) - argparser_predictd.add_argument( "--rfile", dest = "rfile", type = str, default = "predictd_model.R", - help = "PREFIX of filename of R script for drawing X-correlation figure. DEFAULT:'predictd_model.R' and R file will be predicted_model.R" ) - argparser_predictd.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000", - help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. 
However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " ) - - argparser_predictd.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) + argparser_predictd.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+", + help="ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED.") + argparser_predictd.add_argument("-f", "--format", dest="format", type=str, + choices=("AUTO", "BAM", "SAM", "BED", "ELAND", + "ELANDMULTI", "ELANDEXPORT", "BOWTIE", + "BAMPE", "BEDPE"), + help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. However, if you want to decide the average insertion size/fragment size from PE data such as BEDPE or BAMPE, please specify the format as BAMPE or BEDPE since MACS3 won't automatically recognize three two formats with -f AUTO. Please be aware that in PE mode, -g, -s, --bw, --d-min, -m, and --rfile have NO effect. DEFAULT: \"AUTO\"", + default="AUTO") + argparser_predictd.add_argument("-g", "--gsize", dest="gsize", type=str, default="hs", + help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. 
The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.") + argparser_predictd.add_argument("-s", "--tsize", dest="tsize", type=int, default=None, + help="Tag size. This will override the auto detected tag size. DEFAULT: Not set") + argparser_predictd.add_argument("--bw", dest="bw", type=int, default=300, + help="Band width for picking regions to compute fragment size. This value is only used while building the shifting model. DEFAULT: 300") + argparser_predictd.add_argument("--d-min", dest="d_min", type=int, default=20, + help="Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20") + argparser_predictd.add_argument("-m", "--mfold", dest="mfold", type=int, default=[5,50], nargs=2, + help="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". DEFAULT:5 50") + + add_outdir_option(argparser_predictd) + argparser_predictd.add_argument("--rfile", dest="rfile", type=str, default="predictd_model.R", + help="PREFIX of filename of R script for drawing X-correlation figure. DEFAULT:'predictd_model.R' and R file will be predicted_model.R") + argparser_predictd.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000", + help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). 
Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ") + + argparser_predictd.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") return -def add_pileup_parser( subparsers ): - argparser_pileup = subparsers.add_parser( "pileup", - help = "Pileup aligned reads (single-end) or fragments (paired-end)." ) - argparser_pileup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+", - help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED." ) - argparser_pileup.add_argument( "-o", "--ofile", dest = "outputfile", type = str, required = True, - help = "Output bedGraph file name. If not specified, will write to standard output. REQUIRED." ) - add_outdir_option( argparser_pileup ) - argparser_pileup.add_argument( "-f", "--format", dest = "format", type = str, - choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"), - help = "Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS3 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE or BEDPE, please specify it explicitly. Please note that when the format is BAMPE or BEDPE, the -B and --extsize options would be ignored.", - default = "AUTO" ) - argparser_pileup.add_argument( "-B", "--both-direction", dest = "bothdirection", action = "store_true", default = False, - help = "By default, any read will be extended towards downstream direction by extension size. 
So it's [0,size-1] (1-based index system) for plus strand read and [-size+1,0] for minus strand read where position 0 is 5' end of the aligned read. Default behavior can simulate MACS3 way of piling up ChIP sample reads where extension size is set as fragment size/d. If this option is set as on, aligned reads will be extended in both upstream and downstream directions by extension size. It means [-size,size] where 0 is the 5' end of a aligned read. It can partially simulate MACS3 way of piling up control reads. However MACS3 local bias is calculated by maximizing the expected pileup over a ChIP fragment size/d estimated from 10kb, 1kb, d and whole genome background. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: False" ) - - argparser_pileup.add_argument( "--extsize", dest = "extsize", type = int, default = 200, - help = "The extension size in bps. Each alignment read will become a EXTSIZE of fragment, then be piled up. Check description for -B for detail. It's twice the `shiftsize` in old MACSv1 language. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: 200 " ) - argparser_pileup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000", - help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " ) - - argparser_pileup.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level. 
0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" ) + +def add_pileup_parser(subparsers): + argparser_pileup = subparsers.add_parser("pileup", + help="Pileup aligned reads (single-end) or fragments (paired-end).") + argparser_pileup.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+", + help="Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED.") + argparser_pileup.add_argument("-o", "--ofile", dest="outputfile", type=str, required=True, + help="Output bedGraph file name. If not specified, will write to standard output. REQUIRED.") + add_outdir_option(argparser_pileup) + argparser_pileup.add_argument("-f", "--format", dest="format", type=str, + choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE"), + help="Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS3 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE or BEDPE, please specify it explicitly. Please note that when the format is BAMPE or BEDPE, the -B and --extsize options would be ignored.", + default="AUTO") + argparser_pileup.add_argument("-B", "--both-direction", dest="bothdirection", action="store_true", default=False, + help="By default, any read will be extended towards downstream direction by extension size. So it's [0,size-1] (1-based index system) for plus strand read and [-size+1,0] for minus strand read where position 0 is 5' end of the aligned read. 
Default behavior can simulate MACS3 way of piling up ChIP sample reads where extension size is set as fragment size/d. If this option is set as on, aligned reads will be extended in both upstream and downstream directions by extension size. It means [-size,size] where 0 is the 5' end of a aligned read. It can partially simulate MACS3 way of piling up control reads. However MACS3 local bias is calculated by maximizing the expected pileup over a ChIP fragment size/d estimated from 10kb, 1kb, d and whole genome background. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: False") + + argparser_pileup.add_argument("--extsize", dest="extsize", type=int, default=200, + help="The extension size in bps. Each alignment read will become a EXTSIZE of fragment, then be piled up. Check description for -B for detail. It's twice the `shiftsize` in old MACSv1 language. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: 200 ") + argparser_pileup.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000", + help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ") + + argparser_pileup.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. 
DEFAULT:2") return -def add_callvar_parser( subparsers ): + +def add_callvar_parser(subparsers): """Add function 'variant calling' argument parsers. """ argparser_callvar = subparsers.add_parser("callvar", - formatter_class = ap.RawDescriptionHelpFormatter, + formatter_class=ap.RawDescriptionHelpFormatter, help="Call variants in given peak regions from the alignment BAM files.", - epilog = """ Assuming you have two types of BAM files. The first type, what we + epilog=""" Assuming you have two types of BAM files. The first type, what we call `TREAT`, is from DNA enrichment assay such as ChIP-seq or ATAC-seq where the DNA fragments in the sequencing library are enriched in certain genomics regions with potential allele biases; the @@ -630,53 +645,54 @@ To call variants: $ macs3 callvar -b peaks.bed -t TREAT_sorted.bam -c CTRL_sorted.bam -o peaks.vcf """) # group for input files - group_input = argparser_callvar.add_argument_group( "Input files arguments" ) - group_input.add_argument( "-b", "--peak", dest = "peakbed", type = str, required =True, - help = "Peak regions in BED format, sorted by coordinates. REQUIRED." ) - group_input.add_argument( "-t", "--treatment", dest = "tfile", type = str, required = True, - help = "ChIP-seq/ATAC-seq treatment file in BAM format, sorted by coordinates. Make sure the .bai file is avaiable in the same directory. REQUIRED." ) - group_input.add_argument( "-c", "--control", dest = "cfile", type = str, required = False, - help = "Optional control file in BAM format, sorted by coordinates. Make sure the .bai file is avaiable in the same directory.") + group_input = argparser_callvar.add_argument_group("Input files arguments") + group_input.add_argument("-b", "--peak", dest="peakbed", type=str, required =True, + help="Peak regions in BED format, sorted by coordinates. 
REQUIRED.") + group_input.add_argument("-t", "--treatment", dest="tfile", type=str, required=True, + help="ChIP-seq/ATAC-seq treatment file in BAM format, sorted by coordinates. Make sure the .bai file is avaiable in the same directory. REQUIRED.") + group_input.add_argument("-c", "--control", dest="cfile", type=str, required=False, + help="Optional control file in BAM format, sorted by coordinates. Make sure the .bai file is avaiable in the same directory.") # group for output files - group_output = argparser_callvar.add_argument_group( "Output arguments" ) - add_outdir_option( group_output ) - group_output.add_argument( "-o", "--ofile", dest = "ofile", type = str, required = True, - help = "Output VCF file name." ) - group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" ) + group_output = argparser_callvar.add_argument_group("Output arguments") + add_outdir_option(group_output) + group_output.add_argument("-o", "--ofile", dest="ofile", type=str, required=True, + help="Output VCF file name.") + group_output.add_argument("--verbose", dest="verbose", type=int, default=2, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2") # group for parameters - group_para = argparser_callvar.add_argument_group( "Variant calling arguments" ) - group_para.add_argument( "-g", "--gq-hetero", dest = "GQCutoffHetero", type = float, - help = "Genotype Quality score (-10log10((L00+L11)/(L01+L00+L11))) cutoff for Heterozygous allele type. 
Default:0, or there is no cutoff on GQ.", default = 0 ) - group_para.add_argument( "-G", "--gq-homo", dest = "GQCutoffHomo", type = float, - help = "Genotype Quality score (-10log10((L00+L01)/(L01+L00+L11))) cutoff for Homozygous allele (not the same as reference) type. Default:0, or ther is no cutoff on GQ.", default = 0 ) - group_para.add_argument( "-Q", dest = "Q", type = int, default = 20, - help = "Only consider bases with quality score greater than this value. Default: 20, which means Q20 or 0.01 error rate." ) - group_para.add_argument( "-D", dest = "maxDuplicate", type = int, default = 1, - help = "Maximum duplicated reads allowed per mapping position, mapping strand and the same CIGAR code. Default: 1. When sequencing depth is high, to set a higher value might help evaluate the correct allele ratio.") - group_para.add_argument( "-F", "--fermi", dest = "fermi", type = str, default = "auto", - help = "Option to control when to apply local assembly through fermi-lite. By default (set as 'auto'), while callvar detects any INDEL variant in a peak region, it will utilize fermi-lite to recover the actual DNA sequences to refine the read alignments. If set as 'on', fermi-lite will be always invoked. It can increase specificity however sensivity and speed will be significantly lower. If set as 'off', Fermi won't be invoked at all. If so, speed and sensitivity can be higher but specificity will be significantly lower. Default: auto" ) - group_para.add_argument( "--fermi-overlap", dest = "fermiMinOverlap", type = int, - help = "The minimal overlap for fermi to initially assemble two reads. Must be between 1 and read length. A longer fermiMinOverlap is needed while read length is small (e.g. 30 for 36bp read, but 33 for 100bp read may work). Default:30", default = 30 ) - group_para.add_argument( "--top2alleles-mratio", dest = "top2allelesMinRatio", type = float, - help = "The reads for the top 2 most frequent alleles (e.g. 
a ref allele and an alternative allele) at a loci shouldn't be too few comparing to total reads mapped. The minimum ratio is set by this optoin. Must be a float between 0.5 and 1. Default:0.8 which means at least 80%% of reads contain the top 2 alleles.", default = 0.8 ) - group_para.add_argument( "--altallele-count", dest = "altalleleMinCount", type = int, - help = "The count of the alternative (non-reference) allele at a loci shouldn't be too few. By default, we require at least two reads support the alternative allele. Default:2", default = 2 ) - group_para.add_argument( "--max-ar", dest = "maxAR", type = float, - help = "The maximum Allele-Ratio allowed while calculating likelihood for allele-specific binding. If we allow higher maxAR, we may mistakenly assign some homozygous loci as heterozygous. Default:0.95", default = 0.95 ) + group_para = argparser_callvar.add_argument_group("Variant calling arguments") + group_para.add_argument("-g", "--gq-hetero", dest="GQCutoffHetero", type=float, + help="Genotype Quality score (-10log10((L00+L11)/(L01+L00+L11))) cutoff for Heterozygous allele type. Default:0, or there is no cutoff on GQ.", default=0) + group_para.add_argument("-G", "--gq-homo", dest="GQCutoffHomo", type=float, + help="Genotype Quality score (-10log10((L00+L01)/(L01+L00+L11))) cutoff for Homozygous allele (not the same as reference) type. Default:0, or ther is no cutoff on GQ.", default=0) + group_para.add_argument("-Q", dest="Q", type=int, default=20, + help="Only consider bases with quality score greater than this value. Default: 20, which means Q20 or 0.01 error rate.") + group_para.add_argument("-D", dest="maxDuplicate", type=int, default=1, + help="Maximum duplicated reads allowed per mapping position, mapping strand and the same CIGAR code. Default: 1. 
When sequencing depth is high, to set a higher value might help evaluate the correct allele ratio.") + group_para.add_argument("-F", "--fermi", dest="fermi", type=str, default="auto", + help="Option to control when to apply local assembly through fermi-lite. By default (set as 'auto'), while callvar detects any INDEL variant in a peak region, it will utilize fermi-lite to recover the actual DNA sequences to refine the read alignments. If set as 'on', fermi-lite will be always invoked. It can increase specificity however sensivity and speed will be significantly lower. If set as 'off', Fermi won't be invoked at all. If so, speed and sensitivity can be higher but specificity will be significantly lower. Default: auto") + group_para.add_argument("--fermi-overlap", dest="fermiMinOverlap", type=int, + help="The minimal overlap for fermi to initially assemble two reads. Must be between 1 and read length. A longer fermiMinOverlap is needed while read length is small (e.g. 30 for 36bp read, but 33 for 100bp read may work). Default:30", default=30) + group_para.add_argument("--top2alleles-mratio", dest="top2allelesMinRatio", type=float, + help="The reads for the top 2 most frequent alleles (e.g. a ref allele and an alternative allele) at a loci shouldn't be too few comparing to total reads mapped. The minimum ratio is set by this optoin. Must be a float between 0.5 and 1. Default:0.8 which means at least 80%% of reads contain the top 2 alleles.", default=0.8) + group_para.add_argument("--altallele-count", dest="altalleleMinCount", type=int, + help="The count of the alternative (non-reference) allele at a loci shouldn't be too few. By default, we require at least two reads support the alternative allele. Default:2", default=2) + group_para.add_argument("--max-ar", dest="maxAR", type=float, + help="The maximum Allele-Ratio allowed while calculating likelihood for allele-specific binding. If we allow higher maxAR, we may mistakenly assign some homozygous loci as heterozygous. 
Default:0.95", default=0.95) # group for misc - group_misc = argparser_callvar.add_argument_group( "Misc arguments" ) - group_misc.add_argument( "-m", "--multiple-processing", dest = "np", type = int, default = 1, - help = "CPU used for mutliple processing. Please note that, assigning more CPUs does not guarantee the process being faster. Creating too many parrallel processes need memory operations and may negate benefit from multi processing. Default: 1" ) + group_misc = argparser_callvar.add_argument_group("Misc arguments") + group_misc.add_argument("-m", "--multiple-processing", dest="np", type=int, default=1, + help="CPU used for mutliple processing. Please note that, assigning more CPUs does not guarantee the process being faster. Creating too many parrallel processes need memory operations and may negate benefit from multi processing. Default: 1") return -def add_hmmratac_parser( subparsers ): + +def add_hmmratac_parser(subparsers): """Add function 'HMMRATAC' argument parsers. """ argparser_hmmratac = subparsers.add_parser("hmmratac", - formatter_class = ap.RawDescriptionHelpFormatter, - help="Dedicated peak calling based on Hidden Markov Model for ATAC-seq data.", - epilog = """HMMRATAC is a dedicated tool for processing ATAC-seq data + formatter_class=ap.RawDescriptionHelpFormatter, + help="Dedicated peak calling based on Hidden Markov Model for ATAC-seq data.", + epilog="""HMMRATAC is a dedicated tool for processing ATAC-seq data HMMRATAC, first released as a JAVA program in 2019, is a dedicated tool specifically designed for processing ATAC-seq data. In MACS3, it @@ -775,7 +791,7 @@ While there's no universal rule, here are a few suggestions: normal length (average length 500-1000bps). However, it's recommended not to use the lowest cutoff value in the report as this may include too much noise from the genome. 
- + * Tune the HMM model * It's highly recommended to check the runtime message of the HMM model @@ -818,8 +834,8 @@ assignment in the HMM model file (e.g. test_model.json). For the above example, the model.json looks like this (we skipped some detail): ``` -{"startprob": [...], "transmat": [...], "means": [...], "covars": [...], -"covariance_type": "full", "n_features": 4, +{"startprob": [...], "transmat": [...], "means": [...], "covars": [...], +"covariance_type": "full", "n_features": 4, "i_open_region": 2, "i_background_region": 0, "i_nucleosomal_region": 1, "hmm_binsize": 10} ``` @@ -835,145 +851,145 @@ plus an extra option for the HMM model file like `macs3 hmmratac --model new_model.json` """) - + # group for input files - group_input = argparser_hmmratac.add_argument_group( "Input files arguments" ) - group_input.add_argument( "-i", "--input", dest = "input_file", type = str, required = True, nargs = "+", - help = "Input files containing the aligment results for ATAC-seq paired end reads. If multiple files are given as '-t A B C', then they will all be read and pooled together. The file should be in BAMPE or BEDPE format (aligned in paired end mode). Files can be gzipped. Note: all files should be in the same format! REQUIRED." ) - group_input.add_argument( "-f", "--format", dest = "format", type = str, - choices = ("BAMPE", "BEDPE"), - help = "Format of input files, \"BAMPE\" or \"BEDPE\". If there are multiple files, they should be in the same format -- either BAMPE or BEDPE. Please check the definition in README. Also please note that the BEDPE only contains three columns -- chromosome, left position of the whole pair, right position of the whole pair-- and is NOT the same BEDPE format used by BEDTOOLS. To convert BAMPE to BEDPE, you can use this command `macs3 filterdup --keep-dup all -f BAMPE -i input.bam -o output.bedpe`. 
DEFAULT: \"BAMPE\"", - default = "BAMPE" ) - + group_input = argparser_hmmratac.add_argument_group("Input files arguments") + group_input.add_argument("-i", "--input", dest="input_file", type=str, required=True, nargs="+", + help="Input files containing the aligment results for ATAC-seq paired end reads. If multiple files are given as '-t A B C', then they will all be read and pooled together. The file should be in BAMPE or BEDPE format (aligned in paired end mode). Files can be gzipped. Note: all files should be in the same format! REQUIRED.") + group_input.add_argument("-f", "--format", dest="format", type=str, + choices=("BAMPE", "BEDPE"), + help="Format of input files, \"BAMPE\" or \"BEDPE\". If there are multiple files, they should be in the same format -- either BAMPE or BEDPE. Please check the definition in README. Also please note that the BEDPE only contains three columns -- chromosome, left position of the whole pair, right position of the whole pair-- and is NOT the same BEDPE format used by BEDTOOLS. To convert BAMPE to BEDPE, you can use this command `macs3 filterdup --keep-dup all -f BAMPE -i input.bam -o output.bedpe`. DEFAULT: \"BAMPE\"", + default="BAMPE") + # group for output files - group_output = argparser_hmmratac.add_argument_group( "Output arguments" ) - - add_outdir_option( group_output ) - group_output.add_argument( "-n", "--name", dest = "name", type = str, - help = "Name for this experiment, which will be used as a prefix to generate output file names. DEFAULT: \"NA\"", - default = "NA" ) - group_output.add_argument( "--cutoff-analysis-only", dest = "cutoff_analysis_only", action = "store_true", - help = "Only run the cutoff analysis and output a report. After generating the report, the process will stop. By default, the cutoff analysis will be included in the whole process, but won't quit after the report is generated. The report will help user decide the three crucial parameters for `-l`, `-u`, and `-c`. 
So it's highly recommanded to run this first! Please read the report and instructions in `Choices of cutoff values` on how to decide the three crucial parameters. The resolution of cutoff analysis can be controlled by --cutoff-analysis-max and --cutoff-analysis-steps options.", - default = False ) - group_output.add_argument( "--cutoff-analysis-max", dest="cutoff_analysis_max", type = int, - help = "The maximum cutoff score for performing cutoff analysis. Together with --cutoff-analysis-steps, the resolution in the final report can be controlled. Please check the description in --cutoff-analysis-steps for detail. DEFAULT: 100", - default = 100 ) - group_output.add_argument( "--cutoff-analysis-steps", dest="cutoff_analysis_steps", type = int, - help = "Steps for performing cutoff analysis. It will be used to decide which cutoff value should be included in the final report. Larger the value, higher resolution the cutoff analysis can be. The cutoff analysis function will first find the smallest (at least 0) and the largest (controlled by --cutoff-analysis-max) foldchange score in the data, then break the range of foldchange score into `CUTOFF_ANALYSIS_STEPS` intervals. It will then use each foldchange score as cutoff to call peaks and calculate the total number of candidate peaks, the total basepairs of peaks, and the average length of peak in basepair. Please note that the final report ideally should include `CUTOFF_ANALYSIS_STEPS` rows, but in practice, if the foldchange cutoff yield zero peak, the row for that foldchange value won't be included. DEFAULT: 100", - default = 100 ) - group_output.add_argument( "--save-digested", dest = "save_digested", action = "store_true", - help = "Save the digested ATAC signals of short-, mono-, di-, and tri- signals in three BedGraph files with the names NAME_short.bdg, NAME_mono.bdg, NAME_di.bdg, and NAME_tri.bdg. 
DEFAULT: False", - default = False ) - group_output.add_argument( "--save-states", dest = "save_states", action = "store_true", - help = "Save all open and nucleosomal state annotations into a BED file with the name NAME_states.bed. DEFAULT: False", - default = False ) - group_output.add_argument( "--save-likelihoods", dest = "save_likelihoods", action = "store_true", - help = "Save the likelihoods to each state annotation in three BedGraph files, named with NAME_open.bdg for open states, NAME_nuc.bdg for nucleosomal states, and NAME_bg.bdg for the background states. DEFAULT: False", - default = False ) - #group_output.add_argument( "--no-peaks", dest = "store_peaks", action = "store_true", - # help = "Do not report peaks in bed format. Default: false", - # default = False ) - #group_output.add_argument( "--printExclude", dest = "print_exclude", action = "store_true", - # help = "Output excluded regions into Output_exclude.bed. Default: False", - # default = False ) - group_output.add_argument( "--save-training-data", dest = "save_train", action = "store_true", - help = "Save the training regions and training data into NAME_training_regions.bed and NAME_training_data.txt. Default: False", - default = False ) - + group_output = argparser_hmmratac.add_argument_group("Output arguments") + + add_outdir_option(group_output) + group_output.add_argument("-n", "--name", dest="name", type=str, + help="Name for this experiment, which will be used as a prefix to generate output file names. DEFAULT: \"NA\"", + default="NA") + group_output.add_argument("--cutoff-analysis-only", dest="cutoff_analysis_only", action="store_true", + help="Only run the cutoff analysis and output a report. After generating the report, the process will stop. By default, the cutoff analysis will be included in the whole process, but won't quit after the report is generated. The report will help user decide the three crucial parameters for `-l`, `-u`, and `-c`. 
So it's highly recommanded to run this first! Please read the report and instructions in `Choices of cutoff values` on how to decide the three crucial parameters. The resolution of cutoff analysis can be controlled by --cutoff-analysis-max and --cutoff-analysis-steps options.", + default=False) + group_output.add_argument("--cutoff-analysis-max", dest="cutoff_analysis_max", type=int, + help="The maximum cutoff score for performing cutoff analysis. Together with --cutoff-analysis-steps, the resolution in the final report can be controlled. Please check the description in --cutoff-analysis-steps for detail. DEFAULT: 100", + default=100) + group_output.add_argument("--cutoff-analysis-steps", dest="cutoff_analysis_steps", type=int, + help="Steps for performing cutoff analysis. It will be used to decide which cutoff value should be included in the final report. Larger the value, higher resolution the cutoff analysis can be. The cutoff analysis function will first find the smallest (at least 0) and the largest (controlled by --cutoff-analysis-max) foldchange score in the data, then break the range of foldchange score into `CUTOFF_ANALYSIS_STEPS` intervals. It will then use each foldchange score as cutoff to call peaks and calculate the total number of candidate peaks, the total basepairs of peaks, and the average length of peak in basepair. Please note that the final report ideally should include `CUTOFF_ANALYSIS_STEPS` rows, but in practice, if the foldchange cutoff yield zero peak, the row for that foldchange value won't be included. DEFAULT: 100", + default=100) + group_output.add_argument("--save-digested", dest="save_digested", action="store_true", + help="Save the digested ATAC signals of short-, mono-, di-, and tri- signals in three BedGraph files with the names NAME_short.bdg, NAME_mono.bdg, NAME_di.bdg, and NAME_tri.bdg. 
DEFAULT: False", + default=False) + group_output.add_argument("--save-states", dest="save_states", action="store_true", + help="Save all open and nucleosomal state annotations into a BED file with the name NAME_states.bed. DEFAULT: False", + default=False) + group_output.add_argument("--save-likelihoods", dest="save_likelihoods", action="store_true", + help="Save the likelihoods to each state annotation in three BedGraph files, named with NAME_open.bdg for open states, NAME_nuc.bdg for nucleosomal states, and NAME_bg.bdg for the background states. DEFAULT: False", + default=False) + # group_output.add_argument("--no-peaks", dest="store_peaks", action="store_true", + # help="Do not report peaks in bed format. Default: false", + # default=False) + # group_output.add_argument("--printExclude", dest="print_exclude", action="store_true", + # help="Output excluded regions into Output_exclude.bed. Default: False", + # default=False) + group_output.add_argument("--save-training-data", dest="save_train", action="store_true", + help="Save the training regions and training data into NAME_training_regions.bed and NAME_training_data.txt. Default: False", + default=False) + # group for EM - group_em = argparser_hmmratac.add_argument_group( "EM algorithm arguments" ) - group_em.add_argument( "--no-fragem", dest = "em_skip", action = "store_true", - help = "Do not perform EM training on the fragment distribution. If set, EM_MEANS and EM.STDDEVS will be used instead. Default: False", - default = False ) - group_em.add_argument( "--means", dest = "em_means", type = float, nargs = 4, - help = "Comma separated list of initial mean values for the fragment distribution for short fragments, mono-, di-, and tri-nucleosomal fragments. 
Default: 50 200 400 600", - default = [50, 200, 400, 600] ) - group_em.add_argument( "--stddevs", dest = "em_stddevs", type = float, nargs = 4, - help = "Comma separated list of initial standard deviation values for fragment distribution for short fragments, mono-, di-, and tri-nucleosomal fragments. Default: 20 20 20 20", - default = [20, 20, 20, 20] ) - group_em.add_argument( "--min-frag-p", dest = "min_frag_p", type = float, - help = "We will exclude the abnormal fragments that can't be assigned to any of the four signal tracks. After we use EM to find the means and stddevs of the four distributions, we will calculate the likelihood that a given fragment length fit any of the four using normal distribution. The criteria we will use is that if a fragment length has less than MIN_FRAG_P probability to be like either of short, mono, di, or tri-nuc fragment, we will exclude it while generating the four signal tracks for later HMM training and prediction. The value should be between 0 and 1. Larger the value, more abnormal fragments will be allowed. So if you want to include more 'ideal' fragments, make this value smaller. Default = 0.001", - default = 0.001 ) + group_em = argparser_hmmratac.add_argument_group("EM algorithm arguments") + group_em.add_argument("--no-fragem", dest="em_skip", action="store_true", + help="Do not perform EM training on the fragment distribution. If set, EM_MEANS and EM.STDDEVS will be used instead. Default: False", + default=False) + group_em.add_argument("--means", dest="em_means", type=float, nargs=4, + help="Comma separated list of initial mean values for the fragment distribution for short fragments, mono-, di-, and tri-nucleosomal fragments. Default: 50 200 400 600", + default=[50, 200, 400, 600]) + group_em.add_argument("--stddevs", dest="em_stddevs", type=float, nargs=4, + help="Comma separated list of initial standard deviation values for fragment distribution for short fragments, mono-, di-, and tri-nucleosomal fragments. 
Default: 20 20 20 20", + default=[20, 20, 20, 20]) + group_em.add_argument("--min-frag-p", dest="min_frag_p", type=float, + help="We will exclude the abnormal fragments that can't be assigned to any of the four signal tracks. After we use EM to find the means and stddevs of the four distributions, we will calculate the likelihood that a given fragment length fit any of the four using normal distribution. The criteria we will use is that if a fragment length has less than MIN_FRAG_P probability to be like either of short, mono, di, or tri-nuc fragment, we will exclude it while generating the four signal tracks for later HMM training and prediction. The value should be between 0 and 1. Larger the value, more abnormal fragments will be allowed. So if you want to include more 'ideal' fragments, make this value smaller. Default=0.001", + default=0.001) # group for HMM - group_hmm = argparser_hmmratac.add_argument_group( "Hidden Markov Model arguments" ) - #group_hmm.add_argument( "-s", "--states", dest = "hmm_states", type = int, - # help = "Number of States in the model. Default = 3. If not k=3, recommend NOT calling peaks, use bedgraph. This option is named as `--kmeans` in HMMRATAC since it will also control the number of clusters in the k-means clustering process to decide the initial emissions for HMM training.", - # default = 3 ) - group_hmm.add_argument( "--binsize", dest = "hmm_binsize", type = int, - help = "Size of the bins to split the pileup signals for training and decoding with Hidden Markov Model. Must >= 1. Smaller the binsize, higher the resolution of the results, slower the process. Default = 10", - default = 10 ) - group_hmm.add_argument( "-u", "--upper", dest = "hmm_upper", type = int, - help = "Upper limit on fold change range for choosing training sites. This is an important parameter for training so please read. 
The purpose of this parameter is to EXCLUDE those unusually highly enriched chromatin regions so we can get training samples in 'ordinary' regions instead. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The upper cutoff should be the cutoff in the cutoff analysis result that can capture some (typically hundreds of) extremely high enrichment and unusually wide peaks. Default: 20", - default = 20 ) - group_hmm.add_argument( "-l", "--lower", dest = "hmm_lower", type = int, - help = "Lower limit on fold change range for choosing training sites. This is an important parameter for training so please read. The purpose of this parameter is to ONLY INCLUDE those chromatin regions having ordinary enrichment so we can get training samples to learn the common features through HMM. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The lower cutoff should be the cutoff in the cutoff analysis result that can capture moderate number ( about 10k ) of peaks with normal width ( average length 500-1000bps long). Default: 10", - default = 10 ) - group_hmm.add_argument( "--maxTrain", dest = "hmm_maxTrain", type = int, - help = "Maximum number of training regions to use. After we identify the training regions between `-l` and `-u`, the lower and upper cutoffs, we will randomly pick this number of regions for training. Default: 1000", - default = 1000 ) - group_hmm.add_argument( "--training-flanking", dest = "hmm_training_flanking", type = int, required = False, - help = "Training regions will be expanded to both side with this number of basepairs. The purpose is to include more background regions. 
Default: 1000", - default = 1000 ) - group_hmm.add_argument( "-t", "--training", dest = "hmm_training_regions", type = str, required = False, - help = "Filename of training regions (previously was BED_file) to use for training HMM, instead of using foldchange settings to select. Default: NA" ) - #group_hmm.add_argument( "-z", "--zscore", dest = "hmm_zscore", type = int, - # help = "Zscored read depth to mask during Viterbi decoding. Default: 100", - # default = 100 ) - #group_hmm.add_argument( "--window", dest = "hmm_window", type = int, - # help = "Size of the bins to split the genome into for Viterbi decoding. To save memory, the genome is split into WINDOW long bins and viterbi decoding occurs across each bin. Default = 25000000. Note: For machines with limited memory, it is recommended to reduce the size of the bins.", - # default = 25000000 ) - group_hmm.add_argument( "--model", dest = "hmm_file", type = str, required = False, - help = "A JSON file generated from previous HMMRATAC run to use instead of creating new one. When provided, HMM training will be skipped. Default: NA" ) - group_hmm.add_argument( "--modelonly", dest = "hmm_modelonly", action = "store_true", default = False, - help = "Stop the program after generating model. Use this option to generate HMM model ONLY, which can be later applied with `--model`. Default: False") - group_hmm.add_argument( "--hmm-type", dest = "hmm_type", type = str, choices = ("gaussian", "poisson"), default = "gaussian", - help = "Use --hmm-type to select a Gaussian ('gaussian') or Poisson ('poisson') model for the hidden markov model in HMMRATAC. Default: 'gaussian'.") + group_hmm = argparser_hmmratac.add_argument_group("Hidden Markov Model arguments") + #group_hmm.add_argument("-s", "--states", dest="hmm_states", type=int, + # help="Number of States in the model. Default=3. If not k=3, recommend NOT calling peaks, use bedgraph. 
This option is named as `--kmeans` in HMMRATAC since it will also control the number of clusters in the k-means clustering process to decide the initial emissions for HMM training.", + # default=3) + group_hmm.add_argument("--binsize", dest="hmm_binsize", type=int, + help="Size of the bins to split the pileup signals for training and decoding with Hidden Markov Model. Must >= 1. Smaller the binsize, higher the resolution of the results, slower the process. Default=10", + default=10) + group_hmm.add_argument("-u", "--upper", dest="hmm_upper", type=int, + help="Upper limit on fold change range for choosing training sites. This is an important parameter for training so please read. The purpose of this parameter is to EXCLUDE those unusually highly enriched chromatin regions so we can get training samples in 'ordinary' regions instead. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The upper cutoff should be the cutoff in the cutoff analysis result that can capture some (typically hundreds of) extremely high enrichment and unusually wide peaks. Default: 20", + default=20) + group_hmm.add_argument("-l", "--lower", dest="hmm_lower", type=int, + help="Lower limit on fold change range for choosing training sites. This is an important parameter for training so please read. The purpose of this parameter is to ONLY INCLUDE those chromatin regions having ordinary enrichment so we can get training samples to learn the common features through HMM. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The lower cutoff should be the cutoff in the cutoff analysis result that can capture moderate number ( about 10k) of peaks with normal width ( average length 500-1000bps long). 
Default: 10", + default=10) + group_hmm.add_argument("--maxTrain", dest="hmm_maxTrain", type=int, + help="Maximum number of training regions to use. After we identify the training regions between `-l` and `-u`, the lower and upper cutoffs, we will randomly pick this number of regions for training. Default: 1000", + default=1000) + group_hmm.add_argument("--training-flanking", dest="hmm_training_flanking", type=int, required=False, + help="Training regions will be expanded to both side with this number of basepairs. The purpose is to include more background regions. Default: 1000", + default=1000) + group_hmm.add_argument("-t", "--training", dest="hmm_training_regions", type=str, required=False, + help="Filename of training regions (previously was BED_file) to use for training HMM, instead of using foldchange settings to select. Default: NA") + #group_hmm.add_argument("-z", "--zscore", dest="hmm_zscore", type=int, + # help="Zscored read depth to mask during Viterbi decoding. Default: 100", + # default=100) + #group_hmm.add_argument("--window", dest="hmm_window", type=int, + # help="Size of the bins to split the genome into for Viterbi decoding. To save memory, the genome is split into WINDOW long bins and viterbi decoding occurs across each bin. Default=25000000. Note: For machines with limited memory, it is recommended to reduce the size of the bins.", + # default=25000000) + group_hmm.add_argument("--model", dest="hmm_file", type=str, required=False, + help="A JSON file generated from previous HMMRATAC run to use instead of creating new one. When provided, HMM training will be skipped. Default: NA") + group_hmm.add_argument("--modelonly", dest="hmm_modelonly", action="store_true", default=False, + help="Stop the program after generating model. Use this option to generate HMM model ONLY, which can be later applied with `--model`. 
Default: False") + group_hmm.add_argument("--hmm-type", dest="hmm_type", type=str, choices=("gaussian", "poisson"), default="gaussian", + help="Use --hmm-type to select a Gaussian ('gaussian') or Poisson ('poisson') model for the hidden markov model in HMMRATAC. Default: 'gaussian'.") # group for peak calling arguments - group_call = argparser_hmmratac.add_argument_group( "Peak calling/HMM decoding arguments" ) - group_call.add_argument( "-c", "--prescan-cutoff", dest = "prescan_cutoff", type = float, - help = "The fold change cutoff for prescanning candidate regions in the whole dataset. Then we will use HMM to predict/decode states on these candidate regions. Higher the prescan cutoff, fewer regions will be considered. Must > 1. This is an important parameter for decoding so please read. The purpose of this parameter is to EXCLUDE those chromatin regions having noises/random enrichment so we can have a large number of possible regions to predict the HMM states. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The pre-scanning cutoff should be the cutoff close to the BOTTOM of the cutoff analysis result that can capture large number of possible peaks with normal length (average length 500-1000bps). In most cases, please do not pick a cutoff too low that capture almost all the background noises from the data. Default: 1.2", - default = 1.2 ) - - group_call.add_argument( "--minlen", dest = "openregion_minlen", type = int, - help = "Minimum length of open region to call accessible regions. Must be larger than 0. If it is set as 0, it means no filtering on the length of the open regions called. Please note that, when bin size is small, setting a too small OPENREGION_MINLEN will bring a lot of false positives. 
Default: 100", - default = 100 ) - #group_call.add_argument( "--score", dest = "call_score", type = str, choices = ("max", "ave", "med", "fc", "zscore", "all"), - # help = "What type of score system to use for peaks. Can be used for ranking peaks. Default: max", - # default = "max" ) - #group_call.add_argument( "--threshold", dest = "call_threshold", type = float, - # help = "Threshold for reporting peaks. Only peaks who's score is >= this value will be reported. Default: 100", - # default = 100 ) + group_call = argparser_hmmratac.add_argument_group("Peak calling/HMM decoding arguments") + group_call.add_argument("-c", "--prescan-cutoff", dest="prescan_cutoff", type=float, + help="The fold change cutoff for prescanning candidate regions in the whole dataset. Then we will use HMM to predict/decode states on these candidate regions. Higher the prescan cutoff, fewer regions will be considered. Must > 1. This is an important parameter for decoding so please read. The purpose of this parameter is to EXCLUDE those chromatin regions having noises/random enrichment so we can have a large number of possible regions to predict the HMM states. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The pre-scanning cutoff should be the cutoff close to the BOTTOM of the cutoff analysis result that can capture large number of possible peaks with normal length (average length 500-1000bps). In most cases, please do not pick a cutoff too low that capture almost all the background noises from the data. Default: 1.2", + default=1.2) + + group_call.add_argument("--minlen", dest="openregion_minlen", type=int, + help="Minimum length of open region to call accessible regions. Must be larger than 0. If it is set as 0, it means no filtering on the length of the open regions called. 
Please note that, when bin size is small, setting a too small OPENREGION_MINLEN will bring a lot of false positives. Default: 100", + default=100) + # group_call.add_argument("--score", dest="call_score", type=str, choices=("max", "ave", "med", "fc", "zscore", "all"), + # help="What type of score system to use for peaks. Can be used for ranking peaks. Default: max", + # default="max") + # group_call.add_argument("--threshold", dest="call_threshold", type=float, + # help="Threshold for reporting peaks. Only peaks who's score is >= this value will be reported. Default: 100", + # default=100) # group for misc - group_misc = argparser_hmmratac.add_argument_group( "Misc arguments" ) - group_misc.add_argument( "--pileup-short", dest = "pileup_short", action = "store_true", - help = "By default, HMMRATAC will pileup all fragments in order to identify regions for training and candidate regions for decoding. When this option is on, it will pileup only the short fragments to do so. Although it sounds a good idea since we assume that open region should have a lot of short fragments, it may be possible that the overall short fragments are too few to be useful. Default: False", - default = False ) - group_misc.add_argument( "--randomSeed", dest = "hmm_randomSeed", type = int, - help = "Seed to set for random sampling of training regions. Default: 10151", - default = 10151 ) - group_misc.add_argument( "--decoding-steps", dest = "decoding_steps", type = int, default = 1000, - help = "Number of candidate regions to be decoded at a time. The HMM model will be applied with Viterbi to find the optimal state path in each region. bigger the number, 'possibly' faster the decoding process, 'definitely' larger the memory usage. Default: 1000.") - group_misc.add_argument( "-e", "--blacklist", dest = "blacklist", type = str, required = False, - help = "Filename of blacklisted regions to exclude (previously was BED_file). Examples are those from ENCODE. 
Default: NA" ) - group_misc.add_argument( "--keep-duplicates", dest = "misc_keep_duplicates", action = "store_true", - help = "Keep duplicate reads from analysis. By default, duplicate reads will be removed. Default: False", - default = False ) - #group_misc.add_argument( "--trim", dest = "misc_trim", type = int, - # help = "How many signals from the end to trim off (ie starting with tri signal then di etc). This may be useful if your data doesn't contain many large fragments. Default: 0", - # default = 0 ) - #group_misc.add_argument( "-m", "--multiple-processing", dest = "np", type = int, - # help = "CPU used for mutliple processing. Please note that, assigning more CPUs does not guarantee the process being faster. Creating too many parrallel processes need memory operations and may negate benefit from multi processing. Default: 1", - # default = 1 ) - group_misc.add_argument( "--verbose", dest = "verbose", type = int, - help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2", - default = 2 ) - #group_misc.add_argument( "-q", "--minmapq", dest = "min_map_quality", type = int, - # help = "Minimum mapping quality of reads to keep. Default: 30", - # default = 30 ) - group_misc.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000", - help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. 
DEFAULT: 100000 " ) - + group_misc = argparser_hmmratac.add_argument_group("Misc arguments") + group_misc.add_argument("--pileup-short", dest="pileup_short", action="store_true", + help="By default, HMMRATAC will pileup all fragments in order to identify regions for training and candidate regions for decoding. When this option is on, it will pileup only the short fragments to do so. Although it sounds a good idea since we assume that open region should have a lot of short fragments, it may be possible that the overall short fragments are too few to be useful. Default: False", + default=False) + group_misc.add_argument("--randomSeed", dest="hmm_randomSeed", type=int, + help="Seed to set for random sampling of training regions. Default: 10151", + default=10151) + group_misc.add_argument("--decoding-steps", dest="decoding_steps", type=int, default=1000, + help="Number of candidate regions to be decoded at a time. The HMM model will be applied with Viterbi to find the optimal state path in each region. bigger the number, 'possibly' faster the decoding process, 'definitely' larger the memory usage. Default: 1000.") + group_misc.add_argument("-e", "--blacklist", dest="blacklist", type=str, required=False, + help="Filename of blacklisted regions to exclude (previously was BED_file). Examples are those from ENCODE. Default: NA") + group_misc.add_argument("--keep-duplicates", dest="misc_keep_duplicates", action="store_true", + help="Keep duplicate reads from analysis. By default, duplicate reads will be removed. Default: False", + default=False) + # group_misc.add_argument("--trim", dest="misc_trim", type=int, + # help="How many signals from the end to trim off (ie starting with tri signal then di etc). This may be useful if your data doesn't contain many large fragments. Default: 0", + # default=0) + # group_misc.add_argument("-m", "--multiple-processing", dest="np", type=int, + # help="CPU used for mutliple processing. 
Please note that, assigning more CPUs does not guarantee the process being faster. Creating too many parrallel processes need memory operations and may negate benefit from multi processing. Default: 1", + # default=1) + group_misc.add_argument("--verbose", dest="verbose", type=int, + help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2", + default=2) + # group_misc.add_argument("-q", "--minmapq", dest="min_map_quality", type=int, + # help="Minimum mapping quality of reads to keep. Default: 30", + # default=30) + group_misc.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000", + help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ") + return @@ -984,4 +1000,4 @@ if __name__ == '__main__': except KeyboardInterrupt: sys.stderr.write("User interrupted me! ;-) Bye!\n") except MemoryError: - sys.stderr.write( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." ) + sys.stderr.write("MemoryError occurred. 
If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option.") diff --git a/pyproject.toml b/pyproject.toml index d349de11..1f4c6cad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,38 @@ [build-system] requires=['setuptools>=68.0', 'numpy>=1.25,<2.0.0', 'scipy>=1.12', 'cykhash>=2.0,<3.0', 'Cython>=3.0,<3.1', 'scikit-learn>=1.3', 'hmmlearn>=0.3.2'] +build-backend = "setuptools.build_meta" +[project] +name = "MACS3" +dynamic = ["version"] +description = "Model Based Analysis for ChIP-Seq data" +authors = [{name = "Tao Liu", email = "vladimir.liu@gmail.com"}, + {name = "Philippa Doherty", email = "Philippa.Doherty@RoswellPark.org"}] +readme = "README.md" +requires-python = ">=3.9" +classifiers =['Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: BSD License', + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: POSIX', + 'Operating System :: Unix', + 'Topic :: Scientific/Engineering :: Bio-Informatics', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Cython'] +dependencies = ["numpy>=1.25,<2.0.0", + "scipy>=1.12", + "hmmlearn>=0.3.2", + "scikit-learn>=1.3", + "cykhash>=2.0,<3.0"] + +[project.urls] +Homepage = "https://macs3-project.github.io/MACS/" +Documentation = "https://macs3-project.github.io/MACS/" +Repository = "https://github.com/macs3-project/MACS/" +Issues = "https://github.com/macs3-project/MACS/issues" +Changelog = "https://github.com/macs3-project/MACS/blob/master/ChangeLog" diff --git a/setup.py b/setup.py index d0da4a75..20276d8d 100644 --- a/setup.py +++ b/setup.py @@ -20,41 +20,17 @@ # get MACS version exec(open("MACS3/Utilities/Constants.py").read()) -# classifiers
-classifiers =[\ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: BSD License', - 'Operating System :: MacOS :: MacOS X', - 'Operating System :: POSIX', - 'Operating System :: Unix', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Programming Language :: Cython', ] - -install_requires = [ "numpy>=1.25,<2.0.0", - "scipy>=1.12", - "hmmlearn>=0.3.2", - "scikit-learn>=1.3", - "cykhash>=2.0,<3.0"] - - def main(): - if sys.version_info < (3,9): + if sys.version_info < (3, 9): sys.stderr.write("CRITICAL: Python version must >= 3.9!\n") sys.exit(1) # NumPy include dir - numpy_include_dir = [ numpy.get_include() ] + numpy_include_dir = [numpy.get_include()] # CFLAG - extra_c_args = ["-w","-O3", "-g0"] + extra_c_args = ["-w", "-O3", "-g0"] # CFLAG for fermi-lite related codes clang = False @@ -68,14 +44,14 @@ def main(): if not clang: try: - gcc_version_check = subprocess.check_output( [os.environ.get('CC', 'gcc'), "--version"], universal_newlines=True) + gcc_version_check = subprocess.check_output([os.environ.get('CC', 'gcc'), "--version"], universal_newlines=True) if gcc_version_check.find("clang") != -1: clang = True else: - gcc_version_check = gcc_version_check.split('\n')[0] # get the first line - m = re.search(r"\s+(\d+\.\d+)\.\d+", gcc_version_check ) + gcc_version_check = gcc_version_check.split('\n')[0] # get the first line + m = re.search(r"\s+(\d+\.\d+)\.\d+", gcc_version_check) if m: - gcc_version = float( m[1] ) + gcc_version = float(m[1]) if gcc_version > 4.8: new_gcc = True except subprocess.CalledProcessError: @@ -87,67 +63,152 @@ def main(): except KeyError: pass - extra_c_args_for_fermi = ["-std=gnu99","-DUSE_SIMDE", 
"-DSIMDE_ENABLE_NATIVE_ALIASES"] + extra_c_args_for_fermi = ["-std=gnu99", "-DUSE_SIMDE", + "-DSIMDE_ENABLE_NATIVE_ALIASES"] + if icc or sysconfig.get_config_vars()['CC'] == 'icc': - extra_c_args_for_fermi.extend(['-qopenmp-simd', '-DSIMDE_ENABLE_OPENMP']) + extra_c_args_for_fermi.extend(['-qopenmp-simd', + '-DSIMDE_ENABLE_OPENMP']) elif new_gcc or clang or sysconfig.get_config_vars()['CC'] == 'clang': - extra_c_args_for_fermi.extend(['-fopenmp-simd', '-DSIMDE_ENABLE_OPENMP']) + extra_c_args_for_fermi.extend(['-fopenmp-simd', + '-DSIMDE_ENABLE_OPENMP']) # extensions, those have to be processed by Cython - ext_modules = [ \ - Extension("MACS3.Signal.HMMR_EM", ["MACS3/Signal/HMMR_EM.pyx"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args ), - Extension("MACS3.Signal.HMMR_Signal_Processing", ["MACS3/Signal/HMMR_Signal_Processing.pyx"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args ), - Extension("MACS3.Signal.HMMR_HMM", ["MACS3/Signal/HMMR_HMM.pyx"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args ), - Extension("MACS3.Signal.Prob", ["MACS3/Signal/Prob.pyx"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args ), - Extension("MACS3.Signal.Region", ["MACS3/Signal/Region.pyx"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args ), - Extension("MACS3.Signal.Pileup", ["MACS3/Signal/Pileup.pyx","MACS3/Signal/cPosValCalculation.c"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args ), - Extension("MACS3.Signal.PileupV2", ["MACS3/Signal/PileupV2.pyx"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args ), - Extension("MACS3.Signal.PeakModel", ["MACS3/Signal/PeakModel.pyx"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.PeakDetect", ["MACS3/Signal/PeakDetect.pyx"], extra_compile_args=extra_c_args), - Extension("MACS3.Signal.SignalProcessing", 
["MACS3/Signal/SignalProcessing.pyx"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.FixWidthTrack", ["MACS3/Signal/FixWidthTrack.pyx"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.PairedEndTrack", ["MACS3/Signal/PairedEndTrack.pyx"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.BedGraph", ["MACS3/Signal/BedGraph.pyx"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - #Extension("MACS3.Signal.BedGraphV2", ["MACS3/Signal/BedGraphV2.pyx"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.ScoreTrack", ["MACS3/Signal/ScoreTrack.pyx"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args ), - Extension("MACS3.Signal.CallPeakUnit", ["MACS3/Signal/CallPeakUnit.pyx"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.VariantStat",["MACS3/Signal/VariantStat.pyx",],libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.ReadAlignment",["MACS3/Signal/ReadAlignment.pyx",],libraries=["m"],include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.RACollection",["MACS3/Signal/RACollection.pyx","MACS3/fermi-lite/bfc.c","MACS3/fermi-lite/bseq.c",\ - "MACS3/fermi-lite/bubble.c","MACS3/fermi-lite/htab.c","MACS3/fermi-lite/ksw.c","MACS3/fermi-lite/kthread.c",\ - "MACS3/fermi-lite/mag.c","MACS3/fermi-lite/misc.c","MACS3/fermi-lite/mrope.c","MACS3/fermi-lite/rld0.c",\ - "MACS3/fermi-lite/rle.c","MACS3/fermi-lite/rope.c","MACS3/fermi-lite/unitig.c", "MACS3/Signal/swalign.c" ], \ - libraries=["m","z"], include_dirs=numpy_include_dir+["./","./MACS3/fermi-lite/","./MACS3/Signal/"], extra_compile_args=extra_c_args+extra_c_args_for_fermi), - 
Extension("MACS3.Signal.UnitigRACollection",["MACS3/Signal/UnitigRACollection.pyx"],libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.PosReadsInfo",["MACS3/Signal/PosReadsInfo.pyx",],libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.Signal.PeakVariants",["MACS3/Signal/PeakVariants.pyx",],libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - # IO - Extension("MACS3.IO.Parser",["MACS3/IO/Parser.pyx"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), - Extension("MACS3.IO.PeakIO", ["MACS3/IO/PeakIO.pyx"], extra_compile_args=extra_c_args), - Extension("MACS3.IO.BedGraphIO", ["MACS3/IO/BedGraphIO.pyx"], extra_compile_args=extra_c_args), - Extension("MACS3.IO.BAM",["MACS3/IO/BAM.pyx",],libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args) ] - - - with open("README.md", "r") as fh: - long_description = fh.read() - - setup( name = "MACS3", - version = MACS_VERSION, - description = "Model Based Analysis for ChIP-Seq data", - long_description = long_description, - long_description_content_type = "text/markdown", - author = 'Tao Liu', - author_email = 'vladimir.liu@gmail.com', - url = 'http://github.com/macs3-project/MACS/', - package_dir = {'MACS3' : 'MACS3'}, - packages = ['MACS3', 'MACS3.IO', 'MACS3.Signal', 'MACS3.Commands','MACS3.Utilities'], - package_data = {'MACS3':['*.pxd']}, - scripts = ['bin/macs3', ], - classifiers = classifiers, - install_requires = install_requires, - python_requires = '>=3.9', - ext_modules=cythonize( ext_modules ) ), -# compiler_directives={'linetrace': True, 'binding': True}) ) + ext_modules = [Extension("MACS3.Signal.HMMR_EM", + ["MACS3/Signal/HMMR_EM.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.HMMR_Signal_Processing", + ["MACS3/Signal/HMMR_Signal_Processing.pyx"], + 
libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.HMMR_HMM", + ["MACS3/Signal/HMMR_HMM.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.Prob", + ["MACS3/Signal/Prob.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.Region", + ["MACS3/Signal/Region.pyx"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.Pileup", + ["MACS3/Signal/Pileup.pyx", + "MACS3/Signal/cPosValCalculation.c"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.PileupV2", + ["MACS3/Signal/PileupV2.pyx"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.PeakModel", + ["MACS3/Signal/PeakModel.pyx"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.PeakDetect", + ["MACS3/Signal/PeakDetect.pyx"], + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.SignalProcessing", + ["MACS3/Signal/SignalProcessing.pyx"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.FixWidthTrack", + ["MACS3/Signal/FixWidthTrack.pyx"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.PairedEndTrack", + ["MACS3/Signal/PairedEndTrack.pyx"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.BedGraph", + ["MACS3/Signal/BedGraph.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.ScoreTrack", + ["MACS3/Signal/ScoreTrack.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.CallPeakUnit", + ["MACS3/Signal/CallPeakUnit.pyx"], + libraries=["m"], + 
include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.VariantStat", + ["MACS3/Signal/VariantStat.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.ReadAlignment", + ["MACS3/Signal/ReadAlignment.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.RACollection", + ["MACS3/Signal/RACollection.pyx", + "MACS3/fermi-lite/bfc.c", + "MACS3/fermi-lite/bseq.c", + "MACS3/fermi-lite/bubble.c", + "MACS3/fermi-lite/htab.c", + "MACS3/fermi-lite/ksw.c", + "MACS3/fermi-lite/kthread.c", + "MACS3/fermi-lite/mag.c", + "MACS3/fermi-lite/misc.c", + "MACS3/fermi-lite/mrope.c", + "MACS3/fermi-lite/rld0.c", + "MACS3/fermi-lite/rle.c", + "MACS3/fermi-lite/rope.c", + "MACS3/fermi-lite/unitig.c", + "MACS3/Signal/swalign.c"], + libraries=["m", "z"], + include_dirs=numpy_include_dir+["./", + "./MACS3/fermi-lite/", + "./MACS3/Signal/"], + extra_compile_args=extra_c_args+extra_c_args_for_fermi), + Extension("MACS3.Signal.UnitigRACollection", + ["MACS3/Signal/UnitigRACollection.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.PosReadsInfo", + ["MACS3/Signal/PosReadsInfo.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.Signal.PeakVariants", + ["MACS3/Signal/PeakVariants.pyx"], + libraries=["m"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.IO.Parser", + ["MACS3/IO/Parser.pyx"], + include_dirs=numpy_include_dir, + extra_compile_args=extra_c_args), + Extension("MACS3.IO.PeakIO", + ["MACS3/IO/PeakIO.pyx"], + extra_compile_args=extra_c_args), + Extension("MACS3.IO.BedGraphIO", + ["MACS3/IO/BedGraphIO.pyx"], + extra_compile_args=extra_c_args), + Extension("MACS3.IO.BAM", + ["MACS3/IO/BAM.pyx",], libraries=["m"], + include_dirs=numpy_include_dir, 
+ extra_compile_args=extra_c_args)] + + setup(version=MACS_VERSION, + package_dir={'MACS3': 'MACS3'}, + packages=['MACS3', 'MACS3.IO', 'MACS3.Signal', 'MACS3.Commands', 'MACS3.Utilities'], + package_data={'MACS3': ['*.pxd']}, + scripts=['bin/macs3', ], + ext_modules=cythonize(ext_modules)) + if __name__ == '__main__': main()