Skip to content

Commit

Permalink
add function in Region.py to find intersection;made a jaccard.py to c…
Browse files Browse the repository at this point in the history
…acluate jaccard index of two peak files; use the new jaccard.py script to check if two peak files are similar
  • Loading branch information
taoliu committed Feb 12, 2024
1 parent ea37e3e commit fca1328
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 24 deletions.
128 changes: 114 additions & 14 deletions MACS3/Signal/Region.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2022-09-15 16:13:48 Tao Liu>
# Time-stamp: <2024-02-12 15:26:24 Tao Liu>

"""Module for Region classe.
Expand Down Expand Up @@ -48,11 +48,13 @@ cdef class Regions:
cdef:
public dict regions
public int total
bool __flag_sorted
bool __sorted
bool __merged

def __init__ (self):
self.regions= {}
self.__flag_sorted = False
self.__sorted = False
self.__merged = False
self.total = 0

def __getitem__ (self, chrom):
Expand Down Expand Up @@ -119,23 +121,43 @@ cdef class Regions:
else:
self.regions[chrom] = [ (start,end), ]
self.total += 1
self.__flag_sorted = False
self.__sorted = False
self.__merged = False
return

cpdef sort (self):
cdef:
bytes chrom

if self.__flag_sorted:
if self.__sorted:
return
for chrom in sorted(self.regions.keys()):
self.regions[chrom].sort()
self.__flag_sorted = True
self.__sorted = True

cpdef long total_length ( self ):
""" Return the total length of the Regions object.
"""
cdef:
bytes chrom
list ps
int i
int tl, s, e
self.merge_overlap()
tl = 0
for chrom in sorted(self.regions.keys()):
ps = self.regions[chrom]
for i in range( len(ps) ):
s, e = ps[i]
tl += e - s
return tl

cpdef set get_chr_names (self):
return set( sorted(self.regions.keys()) )

cpdef expand ( self, int flanking ):
""" Expand regions to both directions with 'flanking' bps.
"""
cdef:
bytes chrom
list ps
Expand All @@ -148,10 +170,11 @@ cdef class Regions:
ps[i] = ( max(0, ps[i][0] - flanking), ps[i][1] + flanking )
ps.sort()
self.regions[chrom] = ps
self.__merged = False

cpdef merge_overlap ( self ):
"""
merge overlapping regions
Merge overlapping regions of itself.
"""
cdef:
bytes chrom
Expand All @@ -161,6 +184,8 @@ cdef class Regions:
list regions_chr
tuple prev_region

if self.__merged:
return
self.sort()
regions = self.regions
new_regions = {}
Expand Down Expand Up @@ -191,6 +216,7 @@ cdef class Regions:
self.total += len( new_regions[chrom] )
self.regions = new_regions
self.sort()
self.__merged = True
return True

cpdef write_to_bed (self, fhd ):
Expand Down Expand Up @@ -243,28 +269,102 @@ cdef class Regions:
# for p in all_pc:
# ret_peakio.add_PeakContent ( p["chrom"], p )
# return ret_peakio


cpdef object intersect (self, object regions_object2):
""" Get the only intersecting regions comparing with
regions_object2, another Regions object. Then return a new
Regions object.
"""
cdef:
object ret_regions_object
dict regions1, regions2
list chrs1, chrs2, four_coords
bytes k
dict ret_regions
bool overlap_found
tuple r1, r2
long n_rl1, n_rl2

self.sort()
regions1 = self.regions
self.total = 0
assert isinstance(regions_object2, Regions)

regions_object2.sort()
regions2 = regions_object2.regions

ret_regions_object = Regions()
ret_regions = dict()
chrs1 = list(regions1.keys())
chrs2 = list(regions2.keys())
for k in chrs1:
#print(f"chromosome {k}")
if not chrs2.count(k):
# no such chromosome in peaks1, then don't touch the peaks in this chromosome
ret_regions[ k ] = regions1[ k ]
self.total += len( ret_regions[ k ] )
continue
ret_regions[ k ] = []
n_rl1 = len( regions1[k] ) # number of remaining elements in regions1[k]
n_rl2 = len( regions2[k] ) # number of remaining elements in regions2[k]
rl1_k = iter( regions1[k] ).__next__
rl2_k = iter( regions2[k] ).__next__
r1 = rl1_k()
n_rl1 -= 1
r2 = rl2_k()
n_rl2 -= 1
while ( True ):
# we do this until there is no r1 or r2 left.
if r2[0] < r1[1] and r1[0] < r2[1]:
# We found an overlap, now get the intersecting
# region.
four_coords = sorted([r1[0], r1[1], r2[0], r2[1]])
ret_regions[ k ].append( (four_coords[1], four_coords[2]) )
if r1[1] < r2[1]:
# in this case, we need to move to the next r1,
if n_rl1:
r1 = rl1_k()
n_rl1 -= 1
else:
# no more r1 left
break
else:
# in this case, we need to move the next r2
if n_rl2:
r2 = rl2_k()
n_rl2 -= 1
else:
# no more r2 left
break
self.total += len( ret_regions[ k ] )

ret_regions_object.regions = ret_regions
ret_regions_object.sort()
return ret_regions_object


cpdef void exclude (self, object regionio2):
""" Remove overlapping regions in regionio2, another Region
cpdef void exclude (self, object regions_object2):
""" Remove overlapping regions in regions_object2, another Regions
object.
"""
cdef:
dict regions1, regions2
list chrs1, chrs2
bytes k
dict ret_regionss
dict ret_regions
bool overlap_found
tuple r1, r2
long n_rl1, n_rl2

self.sort()
regions1 = self.regions
self.total = 0
assert isinstance(regionio2,Regions)
regionio2.sort()
regions2 = regionio2.regions
assert isinstance(regions_object2,Regions)
regions_object2.sort()
regions2 = regions_object2.regions

ret_regions = dict()
chrs1 = list(regions1.keys())
Expand Down Expand Up @@ -324,6 +424,6 @@ cdef class Regions:
self.total += len( ret_regions[ k ] )

self.regions = ret_regions
self.__flag_sorted = False
self.__sorted = False
self.sort()
return
31 changes: 22 additions & 9 deletions test/cmdlinetest
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Time-stamp: <2023-08-02 17:59:43 Tao Liu>
# Time-stamp: <2024-02-12 15:49:21 Tao Liu>

# integrative subcmds testing

Expand Down Expand Up @@ -266,15 +266,28 @@ for i in ${subfolders[@]};do
cp $fq $fs
echo " ... replaced!"
else
sort $fq| grep -v ^# > tmp_fq.txt
sort $fs| grep -v ^# > tmp_fs.txt
d=`diff tmp_fq.txt tmp_fs.txt`
if [ -z "$d" ]; then
echo " ... success!"
# here we will check if the files are the same/similar
if [[ "$fq" == *.bed || "$fq" == *.gappedPeak || "$fq" == *.narrowPeak || "$fq" == *.bed12 ]]; then
# for peaks, we will use our jaccard index script to check
ji=$(python ./jaccard.py "$fq" "$fs")
# Compare the output value
if (( $(echo "$ji > 0.99" | bc -l) )); then
echo " ... success!"
else
echo " ... failed! jaccard index = $ji"
let "flag=1"
fi
else
echo " ... failed! Difference:"
diff tmp_fq.txt tmp_fs.txt | head -10
let "flag=1";
sort $fq| grep -v ^# > tmp_fq.txt
sort $fs| grep -v ^# > tmp_fs.txt
d=`diff tmp_fq.txt tmp_fs.txt`
if [ -z "$d" ]; then
echo " ... success!"
else
echo " ... failed! Difference:"
diff tmp_fq.txt tmp_fs.txt | head -10
let "flag=1";
fi
fi
fi
done
Expand Down
42 changes: 42 additions & 0 deletions test/jaccard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# Time-stamp: <2024-02-12 15:21:42 Tao Liu>

import sys

from MACS3.Signal.Region import *


# ------------------------------------
# Main function
# ------------------------------------
def main():
if len(sys.argv) < 3:
sys.stderr.write("Calculate Jaccard Index of two peak files and return the index.\n\nneed 2 paras: %s <peak1> <peak2>\n" % sys.argv[0])
sys.exit(1)

#options.info("# Read peak file 1...")
peak1 = open( sys.argv[1] )
regions1 = Regions()
for l in peak1:
fs = l.rstrip().split()
regions1.add_loc( fs[0].encode(), int(fs[1]), int(fs[2]) )
regions1.sort()

#options.info("# Read peak file 2...")
peak2 = open( sys.argv[2] )
regions2 = Regions()
for l in peak2:
fs = l.rstrip().split()
regions2.add_loc( fs[0].encode(), int(fs[1]), int(fs[2]) )
regions2.sort()

tl1 = regions1.total_length()
tl2 = regions2.total_length()
inter = regions1.intersect(regions2)
tl_inter = inter.total_length()

jaccard_index = tl_inter / (tl1 + tl2 - tl_inter)
print (jaccard_index)

if __name__ == '__main__':
main()
36 changes: 35 additions & 1 deletion test/test_Region.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Time-stamp: <2022-09-14 14:32:12 Tao Liu>
# Time-stamp: <2024-02-12 15:23:48 Tao Liu>

import unittest
import sys
Expand All @@ -18,6 +18,8 @@ def setUp( self ):
(b"chrY",600,800),
(b"chrY",1200,1300),
]
# when we add test_regions1 and test_region2 into the same
# Regions, we should get this from 'merge_overlaps'
self.merge_result_regions = [ (b"chrY",0,200),
(b"chrY",300,500),
(b"chrY",600,900),
Expand All @@ -38,6 +40,26 @@ def setUp( self ):
"chr1\t1000\t1200\nchrY\t100\t200\nchrY\t300\t400\n",
"chrY\t600\t800\nchrY\t1200\t1300\n"]

# test_regions4 and test_regions5 are used to test the intersection
self.test_regions4 = [(b"chrY",0,100),
(b"chrY",300,500),
(b"chrY",700,900),
(b"chrY",1000,1200)
]
self.test_regions5 = [(b"chrY",100,200),
(b"chrY",300,400),
(b"chrY",600,800),
(b"chrY",1100,1150),
(b"chrY",1175,1300)
]
# After we add test_regions4 and test_region5 into two Regions
# objects, we should get this from 'intersect'
self.intersect_regions_4_vs_5 = { b"chrY": [ (300, 400),
(700, 800),
(1100,1150),
(1175,1200) ] }


def test_add_loc1( self ):
# make sure the shuffled sequence does not lose any elements
self.r1 = Regions()
Expand Down Expand Up @@ -68,3 +90,15 @@ def test_pop3( self ):
while self.r.total != 0:
ret_list_regions.append( str( self.r.pop( 3 ) ) )
self.assertEqual( ret_list_regions, self.popped_regions )

def test_intersect( self ):
self.r4 = Regions()
for a in self.test_regions4:
self.r4.add_loc(a[0],a[1],a[2])
self.r5 = Regions()
for a in self.test_regions5:
self.r5.add_loc(a[0],a[1],a[2])
self.intersect_4_vs_5 = self.r4.intersect( self.r5 )
self.assertEqual( self.intersect_4_vs_5.regions, self.intersect_regions_4_vs_5 )
self.intersect_4_vs_5 = self.r5.intersect( self.r4 )
self.assertEqual( self.intersect_4_vs_5.regions, self.intersect_regions_4_vs_5 )

0 comments on commit fca1328

Please sign in to comment.