diff --git a/MACS3/Signal/BedGraph.pyx b/MACS3/Signal/BedGraph.py
similarity index 62%
rename from MACS3/Signal/BedGraph.pyx
rename to MACS3/Signal/BedGraph.py
index df57c4f7..2abc6175 100644
--- a/MACS3/Signal/BedGraph.pyx
+++ b/MACS3/Signal/BedGraph.py
@@ -1,6 +1,6 @@
 # cython: language_level=3
 # cython: profile=True
-# Time-stamp: <2024-05-15 19:27:06 Tao Liu>
+# Time-stamp: <2024-10-14 19:32:34 Tao Liu>

 """Module for BedGraph data class.

@@ -12,68 +12,81 @@
 # ------------------------------------
 # python modules
 # ------------------------------------
-#from array import array
-from cpython cimport array
+import cython
 from array import array as pyarray
 from math import prod
 # ------------------------------------
 # MACS3 modules
 # ------------------------------------
 from MACS3.Signal.ScoreTrack import ScoreTrackII
-from MACS3.IO.PeakIO import PeakIO, BroadPeakIO
+from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, PeakContent
 from MACS3.Signal.Prob import chisq_logp_e
 # ------------------------------------
 # Other modules
 # ------------------------------------
-from cpython cimport bool
+from cython.cimports.cpython import bool
 import numpy as np
-cimport numpy as np
-from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
+import cython.cimports.numpy as cnp
 # ------------------------------------
 # C lib
 # ------------------------------------
-from libc.math cimport sqrt, log, log1p, exp, log10
+from cython.cimports.libc.math import sqrt, log10
 # ------------------------------------
 # constants
 # ------------------------------------
-__version__ = "BedGraph $Revision$"
-__author__ = "Tao Liu "
-__doc__ = "bedGraphTrackI class"
+LOG10_E = 0.43429448190325176
 # ------------------------------------
 # Misc functions
 # ------------------------------------
-LOG10_E = 0.43429448190325176
-cdef inline mean_func( x ):
-    return sum( x )/len( x )
-cdef inline fisher_func( x ):
+@cython.inline
+@cython.cfunc
+def mean_func(x):
+    return sum(x)/len(x)
+
+
+@cython.inline
+@cython.cfunc
+def fisher_func(x):
     # combine -log10pvalues
-    return chisq_logp_e( 2*sum (x )/LOG10_E, 2*len( x ), log10=True )
+    return chisq_logp_e(2*sum(x)/LOG10_E, 2*len(x), log10=True)
+
-cdef inline subtract_func( x ):
+@cython.inline
+@cython.cfunc
+def subtract_func(x):
     # subtraction of two items list
     return x[1] - x[0]
+
-cdef inline divide_func( x ):
+@cython.inline
+@cython.cfunc
+def divide_func(x):
     # division of two items list
     return x[1] / x[2]
+
-cdef inline product_func( x ):
+@cython.inline
+@cython.cfunc
+def product_func(x):
     # production of a list of values
     # only python 3.8 or above
-    return prod( x )
-
+    return prod(x)
+
 # ------------------------------------
 # Classes
 # ------------------------------------
+
+
-cdef class bedGraphTrackI:
+@cython.cclass
+class bedGraphTrackI:
     """Class for bedGraph type data.

     In bedGraph, data are represented as continuous non-overlapping
@@ -94,13 +107,12 @@
     this class is 0-indexed and right-open.

     """
-    cdef:
-        dict __data
-        public float32_t maxvalue
-        public float32_t minvalue
-        public float32_t baseline_value
+    __data: dict
+    maxvalue = cython.declare(cython.float, visibility="public")
+    minvalue = cython.declare(cython.float, visibility="public")
+    baseline_value = cython.declare(cython.float, visibility="public")

-    def __init__ (self, float32_t baseline_value=0 ):
+    def __init__(self, baseline_value: cython.float = 0):
         """
         baseline_value is the value to fill in the regions not defined
         in bedGraph.
         For example, if the bedGraph is like:
@@ -112,11 +124,15 @@ def __init__ (self, float32_t baseline_value=0 ):

         """
         self.__data = {}
-        self.maxvalue = -10000000 # initial maximum value is tiny since I want safe_add_loc to update it
+        self.maxvalue = -10000000  # initial maximum value is tiny since I want safe_add_loc to update it
         self.minvalue = 10000000  # initial minimum value is large since I want safe_add_loc to update it
         self.baseline_value = baseline_value

-    cpdef add_loc ( self, bytes chromosome, int32_t startpos, int32_t endpos, float32_t value):
+    @cython.ccall
+    def add_loc(self, chromosome: bytes,
+                startpos: cython.int,
+                endpos: cython.int,
+                value: cython.float):
         """Add a chr-start-end-value block into __data dictionary.

         Note, we don't check if the add_loc is called continuously on
         sorted regions without any gap. So we only suggest calling
         this function within MACS.

         """
-        cdef float32_t pre_v
+        pre_v: cython.float
+
         # basic assumption, end pos should > start pos

         if endpos <= 0:
             return
         if startpos < 0:
             startpos = 0

         if chromosome not in self.__data:
-            self.__data[chromosome] = [ pyarray('i',[]), pyarray('f',[]) ]
+            self.__data[chromosome] = [pyarray('i', []),
+                                       pyarray('f', [])]
             c = self.__data[chromosome]
             if startpos:
                 # start pos is not 0, then add two blocks, the first
@@ -145,7 +163,7 @@ def __init__ (self, float32_t baseline_value=0 ):
         else:
             c = self.__data[chromosome]
             # get the preceding region
-            pre_v = c[1][-1]
+            pre_v = c[1][-1]

             # if this region is next to the previous one.
             if pre_v == value:
@@ -161,7 +179,11 @@ def __init__ (self, float32_t baseline_value=0 ):
         if value < self.minvalue:
             self.minvalue = value

-    cpdef add_loc_wo_merge ( self, bytes chromosome, int32_t startpos, int32_t endpos, float32_t value):
+    @cython.ccall
+    def add_loc_wo_merge(self, chromosome: bytes,
+                         startpos: cython.int,
+                         endpos: cython.int,
+                         value: cython.float):
         """Add a chr-start-end-value block into __data dictionary.

         Note, we don't check if the add_loc is called continuously on
@@ -177,9 +199,10 @@ def __init__ (self, float32_t baseline_value=0 ):

         if value < self.baseline_value:
             value = self.baseline_value
-
+
         if chromosome not in self.__data:
-            self.__data[chromosome] = [ pyarray('i',[]), pyarray('f',[]) ]
+            self.__data[chromosome] = [pyarray('i', []),
+                                       pyarray('f', [])]
         c = self.__data[chromosome]
         if startpos:
             # start pos is not 0, then add two blocks, the first
@@ -194,7 +217,11 @@ def __init__ (self, float32_t baseline_value=0 ):
         if value < self.minvalue:
             self.minvalue = value

-    cpdef add_chrom_data( self, bytes chromosome, object p, object v ):
+    @cython.ccall
+    def add_chrom_data(self,
+                       chromosome: bytes,
+                       p: pyarray,
+                       v: pyarray):
         """Add a pv data to a chromosome. Replace the previous data.

         p: a pyarray object 'i' for positions
@@ -202,19 +229,22 @@ def __init__ (self, float32_t baseline_value=0 ):

         Note: no checks for error, use with caution
         """
-        cdef:
-            float32_t maxv, minv
+        maxv: cython.float
+        minv: cython.float

-        self.__data[ chromosome ] = [ p, v ]
-        maxv = max( v )
-        minv = min( v )
+        self.__data[chromosome] = [p, v]
+        maxv = max(v)
+        minv = min(v)
         if maxv > self.maxvalue:
             self.maxvalue = maxv
         if minv < self.minvalue:
             self.minvalue = minv
         return

-    cpdef add_chrom_data_hmmr_PV( self, bytes chromosome, object pv ):
+    @cython.ccall
+    def add_chrom_data_PV(self,
+                          chromosome: bytes,
+                          pv: cnp.ndarray):
         """Add a pv data to a chromosome. Replace the previous data.
         This is a kinda silly function to waste time and convert a PV
@@ -223,11 +253,11 @@ def __init__ (self, float32_t baseline_value=0 ):

         Note: no checks for error, use with caution
         """
-        cdef:
-            float32_t maxv, minv
-            int32_t i
+        maxv: cython.float
+        minv: cython.float

-        self.__data[ chromosome ] = [ pyarray('i', pv['p']), pyarray('f',pv['v']) ]
+        self.__data[chromosome] = [pyarray('i', pv['p']),
+                                   pyarray('f', pv['v'])]
         minv = pv['v'].min()
         maxv = pv['p'].max()
         if maxv > self.maxvalue:
@@ -235,13 +265,13 @@ def __init__ (self, float32_t baseline_value=0 ):
         if minv < self.minvalue:
             self.minvalue = minv
         return
-
+
-    cpdef bool destroy ( self ):
+    @cython.ccall
+    def destroy(self) -> bool:
         """ destroy content, free memory.
         """
-        cdef:
-            set chrs
-            bytes chrom
+        chrs: set
+        chrom: bytes

         chrs = self.get_chr_names()
         for chrom in sorted(chrs):
@@ -250,7 +280,8 @@ def __init__ (self, float32_t baseline_value=0 ):
             self.__data.pop(chrom)
         return True

-    cpdef list get_data_by_chr (self, bytes chromosome):
+    @cython.ccall
+    def get_data_by_chr(self, chromosome: bytes) -> list:
         """Return array of counts by chromosome.

         The return value is a tuple:
@@ -261,13 +292,15 @@ def __init__ (self, float32_t baseline_value=0 ):
         else:
             return []

-    cpdef set get_chr_names (self):
+    @cython.ccall
+    def get_chr_names(self) -> set:
         """Return all the chromosome names stored.

         """
         return set(sorted(self.__data.keys()))

-    cpdef void reset_baseline (self, float32_t baseline_value):
+    @cython.ccall
+    def reset_baseline(self, baseline_value: cython.float):
         """Reset baseline value to baseline_value.

         So any region between self.baseline_value and baseline_value
@@ -279,33 +312,36 @@ def __init__ (self, float32_t baseline_value=0 ):
         self.merge_regions()
         return

-    cdef merge_regions (self):
+    @cython.cfunc
+    def merge_regions(self):
         """Merge nearby regions with the same value.

         """
-        cdef:
-            int32_t new_pre_pos, pos, i
-            float32_t new_pre_value, value
-            bytes chrom
-            set chrs
+        # new_pre_pos: cython.int
+        pos: cython.int
+        i: cython.int
+        new_pre_value: cython.float
+        value: cython.float
+        chrom: bytes
+        chrs: set

         chrs = self.get_chr_names()
         for chrom in sorted(chrs):
-            (p,v) = self.__data[chrom]
+            (p, v) = self.__data[chrom]

             pnext = iter(p).__next__
             vnext = iter(v).__next__

             # new arrays
-            new_pos = pyarray('L',[pnext(),])
-            new_value = pyarray('f',[vnext(),])
+            new_pos = pyarray('L', [pnext(),])
+            new_value = pyarray('f', [vnext(),])

             newpa = new_pos.append
             newva = new_value.append

-            new_pre_pos = new_pos[0]
+            # new_pre_pos = new_pos[0]
             new_pre_value = new_value[0]

-            for i in range(1,len(p)):
+            for i in range(1, len(p)):
                 pos = pnext()
                 value = vnext()
                 if value == new_pre_value:
@@ -314,33 +350,36 @@ def __init__ (self, float32_t baseline_value=0 ):
                     # add new region
                     newpa(pos)
                     newva(value)
-                    new_pre_pos = pos
+                    # new_pre_pos = pos
                     new_pre_value = value
-            self.__data[chrom] = [new_pos,new_value]
+            self.__data[chrom] = [new_pos, new_value]
         return True

-    cpdef bool filter_score (self, float32_t cutoff=0):
+    @cython.ccall
+    def filter_score(self, cutoff: cython.float = 0) -> bool:
         """Filter using a score cutoff. Any region lower than score
         cutoff will be set to self.baseline_value.

         Self will be modified.
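For orientation, here is a minimal usage sketch of this track class (illustrative only, not part of the patch; coordinates and scores are made up). It assumes the post-rename import path from this commit:

    from MACS3.Signal.BedGraph import bedGraphTrackI

    bdg = bedGraphTrackI(baseline_value=0)
    # regions must be added left to right per chromosome, without overlap
    bdg.add_loc(b"chr1", 0, 100, 1.5)
    bdg.add_loc(b"chr1", 100, 200, 3.0)
    bdg.add_loc(b"chr1", 200, 300, 0.5)
    # drop every region scoring below 1.0 down to the baseline value
    bdg.filter_score(cutoff=1.0)
    positions, values = bdg.get_data_by_chr(b"chr1")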
""" - cdef: - int32_t new_pre_pos, pos, i - float32_t new_pre_value, value - bytes chrom - set chrs + # new_pre_pos: cython.int + pos: cython.int + i: cython.int + new_pre_value: cython.float + value: cython.float + chrom: bytes + chrs: set chrs = self.get_chr_names() for chrom in sorted(chrs): - (p,v) = self.__data[chrom] + (p, v) = self.__data[chrom] pnext = iter(p).__next__ vnext = iter(v).__next__ # new arrays - new_pos = pyarray('L',[]) - new_value = pyarray('f',[]) - new_pre_pos = 0 + new_pos = pyarray('L', []) + new_value = pyarray('f', []) + # new_pre_pos = 0 new_pre_value = 0 for i in range(len(p)): @@ -360,54 +399,66 @@ def __init__ (self, float32_t baseline_value=0 ): # put it into new arrays new_pos.append(pos) new_value.append(value) - new_pre_pos = new_pos[-1] + # new_pre_pos = new_pos[-1] new_pre_value = new_value[-1] - self.__data[chrom]=[new_pos,new_value] + self.__data[chrom] = [new_pos, new_value] return True - cpdef tuple summary (self): - """Calculate the sum, total_length, max, min, mean, and std. + @cython.ccall + def summary(self) -> tuple: + """Calculate the sum, total_length, max, min, mean, and std. Return a tuple for (sum, total_length, max, min, mean, std). + """ - cdef: - int64_tn_v - float32_t sum_v, max_v, min_v, mean_v, variance, tmp, std_v - int32_t pre_p, l, i + n_v: cython.long + sum_v: cython.float + max_v: cython.float + min_v: cython.float + mean_v: cython.float + variance: cython.float + tmp: cython.float + std_v: cython.float + pre_p: cython.int + ln: cython.int + i: cython.int pre_p = 0 n_v = 0 sum_v = 0 max_v = -100000 min_v = 100000 - for (p,v) in self.__data.values(): + for (p, v) in self.__data.values(): # for each chromosome pre_p = 0 for i in range(len(p)): # for each region - l = p[i]-pre_p - sum_v += v[i]*l - n_v += l + ln = p[i]-pre_p + sum_v += v[i]*ln + n_v += ln pre_p = p[i] - max_v = max(max(v),max_v) - min_v = min(min(v),min_v) + max_v = max(max(v), max_v) + min_v = min(min(v), min_v) mean_v = sum_v/n_v variance = 0.0 - for (p,v) in self.__data.values(): + for (p, v) in self.__data.values(): for i in range(len(p)): # for each region tmp = v[i]-mean_v - l = p[i]-pre_p - variance += tmp*tmp*l + ln = p[i]-pre_p + variance += tmp*tmp*ln pre_p = p[i] variance /= float(n_v-1) std_v = sqrt(variance) return (sum_v, n_v, max_v, min_v, mean_v, std_v) - cpdef object call_peaks (self, float32_t cutoff=1, - int32_t min_length=200, int32_t max_gap=50, - bool call_summits=False): + @cython.ccall + def call_peaks(self, + cutoff: cython.float = 1, + min_length: cython.int = 200, + max_gap: cython.int = 50, + call_summits: bool = False): """This function try to find regions within which, scores are continuously higher than a given cutoff. @@ -430,19 +481,21 @@ def __init__ (self, float32_t baseline_value=0 ): included as `gap` . 
""" - cdef: - int32_t peak_length, x, pre_p, p, i, summit, tstart, tend - float32_t v, summit_value, tvalue - bytes chrom - set chrs - object peaks + # peak_length: cython.int + x: cython.int + pre_p: cython.int + p: cython.int + i: cython.int + v: cython.float + chrom: bytes + chrs: set chrs = self.get_chr_names() peaks = PeakIO() # dictionary to save peaks for chrom in sorted(chrs): peak_content = None - peak_length = 0 - (ps,vs) = self.get_data_by_chr(chrom) # arrays for position and values + # peak_length = 0 + (ps, vs) = self.get_data_by_chr(chrom) # arrays for position and values psn = iter(ps).__next__ # assign the next function to a viable to speed up vsn = iter(vs).__next__ x = 0 @@ -452,72 +505,90 @@ def __init__ (self, float32_t baseline_value=0 ): try: # try to read the first data range for this chrom p = psn() v = vsn() - except: + except Exception: break x += 1 # index for the next point if v >= cutoff: - peak_content = [(pre_p,p,v),] + peak_content = [(pre_p, p, v),] pre_p = p break # found the first range above cutoff else: pre_p = p - for i in range(x,len(ps)): + for i in range(x, len(ps)): # continue scan the rest regions p = psn() v = vsn() - if v < cutoff: # not be detected as 'peak' + if v < cutoff: # not be detected as 'peak' pre_p = p continue # for points above cutoff # if the gap is allowed if pre_p - peak_content[-1][1] <= max_gap: - peak_content.append((pre_p,p,v)) + peak_content.append((pre_p, p, v)) else: # when the gap is not allowed, close this peak - self.__close_peak(peak_content, peaks, min_length, chrom) #, smoothlen=max_gap / 2 ) + self.__close_peak(peak_content, + peaks, + min_length, + chrom) # , smoothlen=max_gap / 2) # start a new peak - peak_content = [(pre_p,p,v),] + peak_content = [(pre_p, p, v),] pre_p = p # save the last peak if not peak_content: continue - self.__close_peak(peak_content, peaks, min_length, chrom) #, smoothlen=max_gap / 2 ) + self.__close_peak(peak_content, + peaks, + min_length, + chrom) # , smoothlen=max_gap / 2) return peaks - cdef bool __close_peak( self, list peak_content, object peaks, int32_t min_length, bytes chrom ): - cdef: - list tsummit # list for temporary summits - int32_t peak_length, summit, tstart, tend - float32_t summit_value, tvalue - + @cython.cfunc + def __close_peak(self, + peak_content: list, + peaks, + min_length: cython.int, + chrom: bytes) -> bool: + tsummit: list # list for temporary summits + peak_length: cython.int + summit: cython.int + tstart: cython.int + tend: cython.int + summit_value: cython.float + tvalue: cython.float peak_length = peak_content[-1][1]-peak_content[0][0] - if peak_length >= min_length: # if the peak is too small, reject it + if peak_length >= min_length: # if the peak is too small, reject it tsummit = [] summit = 0 summit_value = 0 - for (tstart,tend,tvalue) in peak_content: + for (tstart, tend, tvalue) in peak_content: if not summit_value or summit_value < tvalue: - tsummit = [((tend+tstart)/2),] + tsummit = [cython.cast(cython.int, (tend+tstart)/2),] summit_value = tvalue elif summit_value == tvalue: - tsummit.append( ((tend+tstart)/2) ) - summit = tsummit[((len(tsummit)+1)/2)-1 ] - peaks.add( chrom, - peak_content[0][0], - peak_content[-1][1], - summit = summit, - peak_score = summit_value, - pileup = 0, - pscore = 0, - fold_change = 0, - qscore = 0 - ) + tsummit.append(cython.cast(cython.int, (tend+tstart)/2)) + summit = tsummit[cython.cast(cython.int, (len(tsummit)+1)/2)-1] + peaks.add(chrom, + peak_content[0][0], + peak_content[-1][1], + summit=summit, + 
+                      peak_score=summit_value,
+                      pileup=0,
+                      pscore=0,
+                      fold_change=0,
+                      qscore=0
+                      )
         return True

-    cpdef object call_broadpeaks (self, float32_t lvl1_cutoff=500, float32_t lvl2_cutoff=100,
-                                  int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400):
+    @cython.ccall
+    def call_broadpeaks(self,
+                        lvl1_cutoff: cython.float = 500,
+                        lvl2_cutoff: cython.float = 100,
+                        min_length: cython.int = 200,
+                        lvl1_max_gap: cython.int = 50,
+                        lvl2_max_gap: cython.int = 400):
         """This function tries to find enriched regions within which
         scores are continuously higher than a given cutoff for level
         1, and link them using the gap above level 2 cutoff with a
@@ -533,17 +604,25 @@ def __init__ (self, float32_t baseline_value=0 ):
         Return both general PeakIO object for highly enriched regions
         and gapped broad regions in BroadPeakIO.
         """
-        cdef:
-            bytes chrom
-            int32_t i, j
-            set chrs
-            object lvl1, lvl2 # PeakContent class object
-            list temppeakset, lvl1peakschrom, lvl2peakschrom
-
+        chrom: bytes
+        i: cython.int
+        j: cython.int
+        chrs: set
+        lvl1: PeakContent
+        lvl2: PeakContent       # PeakContent class object
+        lvl1peakschrom: list
+        lvl2peakschrom: list
+
         assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2."
         assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1."
-        lvl1_peaks = self.call_peaks( cutoff=lvl1_cutoff, min_length=min_length, max_gap=lvl1_max_gap, call_summits=False )
-        lvl2_peaks = self.call_peaks( cutoff=lvl2_cutoff, min_length=min_length, max_gap=lvl2_max_gap, call_summits=False )
+        lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff,
+                                     min_length=min_length,
+                                     max_gap=lvl1_max_gap,
+                                     call_summits=False)
+        lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff,
+                                     min_length=min_length,
+                                     max_gap=lvl2_max_gap,
+                                     call_summits=False)
         chrs = lvl1_peaks.get_chr_names()
         broadpeaks = BroadPeakIO()
         # use lvl2_peaks as linking regions between lvl1_peaks
@@ -555,51 +634,73 @@ def __init__ (self, float32_t baseline_value=0 ):
             # our assumption is lvl1 regions should be included in lvl2 regions
             try:
                 lvl1 = lvl1peakschrom_next()
-                for i in range( len(lvl2peakschrom) ):
+                for i in range(len(lvl2peakschrom)):
                     # for each lvl2 peak, find all lvl1 peaks inside
                     lvl2 = lvl2peakschrom[i]
                     while True:
-                        if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
+                        if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
                             tmppeakset.append(lvl1)
                             lvl1 = lvl1peakschrom_next()
                         else:
-                            self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
+                            self.__add_broadpeak(broadpeaks,
+                                                 chrom,
+                                                 lvl2,
+                                                 tmppeakset)
                             tmppeakset = []
                             break
             except StopIteration:
-                self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
+                self.__add_broadpeak(broadpeaks, chrom, lvl2, tmppeakset)
                 tmppeakset = []
-                for j in range( i+1, len(lvl2peakschrom) ):
-                    self.__add_broadpeak ( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset)
+                for j in range(i+1, len(lvl2peakschrom)):
+                    self.__add_broadpeak(broadpeaks,
+                                         chrom,
+                                         lvl2peakschrom[j],
+                                         tmppeakset)
         return broadpeaks

-    cdef object __add_broadpeak (self, object bpeaks, bytes chrom, object lvl2peak, list lvl1peakset):
+    @cython.cfunc
+    def __add_broadpeak(self,
+                        bpeaks,
+                        chrom: bytes,
+                        lvl2peak: PeakContent,
+                        lvl1peakset: list):
         """Internal function to create broad peak.
- """ - cdef: - int32_t start, end, blockNum - bytes blockSizes, blockStarts, thickStart, thickEnd - - start = lvl2peak["start"] - end = lvl2peak["end"] - - # the following code will add those broad/lvl2 peaks with no strong/lvl1 peaks inside + start: cython.int + end: cython.int + blockNum: cython.int + blockSizes: bytes + blockStarts: bytes + thickStart: bytes + thickEnd: bytes + + start = lvl2peak["start"] + end = lvl2peak["end"] + + # the following code will add those broad/lvl2 peaks with no + # strong/lvl1 peaks inside if not lvl1peakset: # try: # will complement by adding 1bps start and end to this region # may change in the future if gappedPeak format was improved. - bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end), - blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"], - pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], - qscore = lvl2peak["qscore"] ) + bpeaks.add(chrom, start, end, + score=lvl2peak["score"], + thickStart=(b"%d" % start), + thickEnd=(b"%d" % end), + blockNum=2, + blockSizes=b"1,1", + blockStarts=(b"0,%d" % (end-start-1)), + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) return bpeaks thickStart = b"%d" % lvl1peakset[0]["start"] - thickEnd = b"%d" % lvl1peakset[-1]["end"] - blockNum = len(lvl1peakset) - blockSizes = b",".join( [b"%d" % x["length"] for x in lvl1peakset] ) - blockStarts = b",".join( [b"%d" % (x["start"]-start) for x in lvl1peakset] ) + thickEnd = b"%d" % lvl1peakset[-1]["end"] + blockNum = len(lvl1peakset) + blockSizes = b",".join([b"%d" % x["length"] for x in lvl1peakset]) + blockStarts = b",".join([b"%d" % (x["start"]-start) for x in lvl1peakset]) if int(thickStart) != start: # add 1bp left block @@ -614,62 +715,72 @@ def __init__ (self, float32_t baseline_value=0 ): blockSizes = blockSizes+b",1" blockStarts = blockStarts + b"," + (b"%d" % (end-start-1)) - bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd, - blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"], - pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], - qscore = lvl2peak["qscore"] ) + bpeaks.add(chrom, start, end, + score=lvl2peak["score"], + thickStart=thickStart, + thickEnd=thickEnd, + blockNum=blockNum, + blockSizes=blockSizes, + blockStarts=blockStarts, + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) return bpeaks - cpdef object refine_peaks (self, object peaks): + @cython.ccall + def refine_peaks(self, peaks): """This function try to based on given peaks, re-evaluate the peak region, call the summit. 
         peaks: PeakIO object
-
         return: a new PeakIO object
         """
-        cdef:
-            int32_t peak_length, x, pre_p, p, i, peak_s, peak_e
-            float32_t v
-            bytes chrom
-            set chrs
-            object new_peaks
+        pre_p: cython.int
+        p: cython.int
+        peak_s: cython.int
+        peak_e: cython.int
+        v: cython.float
+        chrom: bytes
+        chrs: set

         peaks.sort()
         new_peaks = PeakIO()
         chrs = self.get_chr_names()
         assert isinstance(peaks, PeakIO)
         chrs = chrs.intersection(set(peaks.get_chr_names()))
-
+
         for chrom in sorted(chrs):
             peaks_chr = peaks.get_data_from_chrom(chrom)
             peak_content = []
-            (ps,vs) = self.get_data_by_chr(chrom) # arrays for position and values
-            psn = iter(ps).__next__ # assign the next function to a viable to speed up
+            # arrays for position and values
+            (ps, vs) = self.get_data_by_chr(chrom)
+            # assign the next function to a variable to speed up
+            psn = iter(ps).__next__
             vsn = iter(vs).__next__
             peakn = iter(peaks_chr).__next__
-            pre_p = 0 # remember previous position in bedgraph/self
+            # remember previous position in bedgraph/self
+            pre_p = 0
             p = psn()
             v = vsn()
             peak = peakn()
             peak_s = peak["start"]
             peak_e = peak["end"]
-
             while True:
                 # look for overlap
                 if p > peak_s and peak_e > pre_p:
                     # now put four coordinates together and pick the middle two
                     s, e = sorted([p, peak_s, peak_e, pre_p])[1:3]
                     # add this content
-                    peak_content.append( (s, e, v) )
+                    peak_content.append((s, e, v))
                     # move self/bedGraph
                     try:
                         pre_p = p
                         p = psn()
                         v = vsn()
-                    except:
+                    except Exception:
                         # no more value chunk in bedGraph
                         break
                 elif pre_p >= peak_e:
@@ -681,7 +792,7 @@ def __init__ (self, float32_t baseline_value=0 ):
                         peak = peakn()
                         peak_s = peak["start"]
                         peak_e = peak["end"]
-                    except:
+                    except Exception:
                         # no more peak
                         break
                 elif peak_s >= p:
@@ -690,7 +801,7 @@ def __init__ (self, float32_t baseline_value=0 ):
                         pre_p = p
                         p = psn()
                         v = vsn()
-                    except:
+                    except Exception:
                         # no more value chunk in bedGraph
                         break
                 else:
@@ -701,39 +812,39 @@ def __init__ (self, float32_t baseline_value=0 ):
                 self.__close_peak(peak_content, new_peaks, 0, chrom)
         return new_peaks

-
-    cpdef int32_t total (self):
+    @cython.ccall
+    def total(self) -> cython.int:
         """Return the number of regions in this object.

         """
-        cdef:
-            int32_t t
+        t: cython.int

         t = 0
-        for ( p, v ) in self.__data.values():
+        for (p, v) in self.__data.values():
             t += len(p)
         return t

-    cpdef object set_single_value (self, float32_t new_value):
+    @cython.ccall
+    def set_single_value(self, new_value: cython.float):
         """Change all the values in bedGraph to the same new_value,
         return a new bedGraphTrackI.

         """
-        cdef:
-            bytes chrom
-            int32_t max_p
-            object ret
+        chrom: bytes
+        max_p: cython.int

         ret = bedGraphTrackI()
         chroms = set(self.get_chr_names())
         for chrom in sorted(chroms):
-            (p1,v1) = self.get_data_by_chr(chrom) # arrays for position and values
+            # arrays for position and values
+            (p1, v1) = self.get_data_by_chr(chrom)
             # maximum p
             max_p = max(p1)
             # add a region from 0 to max_p
-            ret.add_loc(chrom,0,max_p,new_value)
+            ret.add_loc(chrom, 0, max_p, new_value)
        return ret

-    cpdef object overlie (self, object bdgTracks, str func="max" ):
+    @cython.ccall
+    def overlie(self, bdgTracks, func: str = "max"):
         """Calculate two or more bedGraphTrackI objects by letting self
         overlie bdgTrack2, with user-defined functions.
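A quick illustration of the calling convention (a sketch with made-up tracks; per the error message further below, "subtract" and "divide" are restricted to exactly two tracks):

    from MACS3.Signal.BedGraph import bedGraphTrackI

    treat = bedGraphTrackI()
    ctrl = bedGraphTrackI()
    treat.add_loc(b"chr1", 0, 100, 4.0)
    ctrl.add_loc(b"chr1", 0, 100, 1.0)
    # position-wise maximum over the common chromosomes
    combined = treat.overlie([ctrl], func="max")
    # position-wise difference (only defined for two tracks)
    difference = treat.overlie([ctrl], func="subtract")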
@@ -769,10 +880,8 @@ def __init__ (self, float32_t baseline_value=0 ):

         Option: bdgTracks can be a list of bedGraphTrackI objects
         """
-        cdef:
-            int32_t pre_p, p1, p2
-            float32_t v1, v2
-            bytes chrom
+        pre_p: cython.int
+        chrom: bytes

         nr_tracks = len(bdgTracks) + 1  # +1 for self
         assert nr_tracks >= 2, "Specify at least one more bdg objects."
@@ -803,7 +912,6 @@ def __init__ (self, float32_t baseline_value=0 ):
             raise Exception("Invalid function {func}! Choose from 'sum', 'subtract' (only for two bdg objects), 'product', 'divide' (only for two bdg objects), 'max', 'mean' and 'fisher'. ")

         ret = bedGraphTrackI()
-        retadd = ret.add_loc

         common_chr = set(self.get_chr_names())
         for track in bdgTracks:
@@ -844,69 +952,79 @@ def __init__ (self, float32_t baseline_value=0 ):
                 pass
         return ret

-    cpdef bool apply_func ( self, func ):
+    @cython.ccall
+    def apply_func(self, func) -> bool:
         """Apply function 'func' to every value in this bedGraphTrackI object.

         *Two adjacent regions with same value after applying func will
         not be merged.
         """
-        cdef int32_t i
+        i: cython.int

-        for (p,s) in self.__data.values():
+        for (p, s) in self.__data.values():
             for i in range(len(s)):
                 s[i] = func(s[i])
         self.maxvalue = func(self.maxvalue)
         self.minvalue = func(self.minvalue)
         return True

-    cpdef p2q ( self ):
+    @cython.ccall
+    def p2q(self):
         """Convert pvalue scores to qvalue scores.

         *Assume scores in this bedGraph are pvalue scores! Not work
         for other type of scores.
         """
-        cdef:
-            bytes chrom
-            object pos_array, pscore_array
-            dict pvalue_stat = {}
-            dict pqtable = {}
-            int64_t n, pre_p, this_p, length, j, pre_l, l, i
-            float32_t this_v, pre_v, v, q, pre_q, this_t, this_c
-            int64_t N, k, this_l
-            float32_t f
-            int64_t nhcal = 0
-            int64_t npcal = 0
-            list unique_values
-            float32_t t0, t1, t
+        chrom: bytes
+        pos_array: pyarray
+        pscore_array: pyarray
+        pvalue_stat: dict = {}
+        pqtable: dict = {}
+        pre_p: cython.long
+        this_p: cython.long
+        # pre_l: cython.long
+        l: cython.long
+        i: cython.long
+        nhcal: cython.long = 0
+        N: cython.long
+        k: cython.long
+        this_l: cython.long
+        this_v: cython.float
+        # pre_v: cython.float
+        v: cython.float
+        q: cython.float
+        pre_q: cython.float
+        f: cython.float
+        unique_values: list

         # calculate frequencies of each p-score
         for chrom in sorted(self.get_chr_names()):
             pre_p = 0

-            [pos_array, pscore_array] = self.__data[ chrom ]
+            [pos_array, pscore_array] = self.__data[chrom]

             pn = iter(pos_array).__next__
             vn = iter(pscore_array).__next__

-            for i in range( len( pos_array ) ):
+            for i in range(len(pos_array)):
                 this_p = pn()
                 this_v = vn()
                 this_l = this_p - pre_p
                 if this_v in pvalue_stat:
-                    pvalue_stat[ this_v ] += this_l
+                    pvalue_stat[this_v] += this_l
                 else:
-                    pvalue_stat[ this_v ] = this_l
                 pre_p = this_p
+                    pvalue_stat[this_v] = this_l
+                pre_p = this_p

-            nhcal += len( pos_array )
+            # nhcal += len(pos_array)

-        nhval = 0
+        # nhval = 0

-        N = sum(pvalue_stat.values()) # total length
-        k = 1 # rank
+        N = sum(pvalue_stat.values())  # total length
+        k = 1                          # rank
         f = -log10(N)
-        pre_v = -2147483647
-        pre_l = 0
+        # pre_v = -2147483647
+        # pre_l = 0
         pre_q = 2147483647  # save the previous q-value

         # calculate qscore for each pscore
         unique_values = sorted(pvalue_stat.keys(), reverse=True)
         for i in range(len(unique_values)):
             v = unique_values[i]
             l = pvalue_stat[v]
             q = v + (log10(k) + f)
-            q = max(0,min(pre_q,q)) # make q-score monotonic
-            pqtable[ v ] = q
-            pre_v = v
+            q = max(0, min(pre_q, q))  # make q-score monotonic
+            pqtable[v] = q
+            # pre_v = v
             pre_q = q
-            k+=l
+            k += l
             nhcal += 1

         # convert pscore to qscore
         for chrom in sorted(self.get_chr_names()):
-            [pos_array, pscore_array] = self.__data[ chrom ]
+            [pos_array, pscore_array] = self.__data[chrom]

-            for i in range( len( pos_array ) ):
-                pscore_array[ i ] = pqtable[ pscore_array[ i ] ]
+            for i in range(len(pos_array)):
+                pscore_array[i] = pqtable[pscore_array[i]]

         self.merge_regions()
         return

-
-    cpdef object extract_value ( self, object bdgTrack2 ):
+    @cython.ccall
+    def extract_value(self, bdgTrack2):
         """Extract values from regions defined in bedGraphTrackI class object
         `bdgTrack2`.

         """
-        cdef:
-            int32_t pre_p, p1, p2, i
-            float32_t v1, v2
-            bytes chrom
-            object ret
-
-        assert isinstance(bdgTrack2,bedGraphTrackI), "not a bedGraphTrackI object"
-
-        ret = [ [], pyarray('f',[]), pyarray('L',[]) ] # 1: region in bdgTrack2; 2: value; 3: length with the value
+        pre_p: cython.int
+        p1: cython.int
+        p2: cython.int
+        i: cython.int
+        v1: cython.float
+        v2: cython.float
+        chrom: bytes
+
+        assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object"
+
+        # 1: region in bdgTrack2; 2: value; 3: length with the value
+        ret = [[], pyarray('f', []), pyarray('L', [])]
         radd = ret[0].append
         vadd = ret[1].append
         ladd = ret[2].append

         chr1 = set(self.get_chr_names())
         chr2 = set(bdgTrack2.get_chr_names())
         common_chr = chr1.intersection(chr2)
-        for i in range( len( common_chr ) ):
+        for i in range(len(common_chr)):
             chrom = common_chr.pop()
-            (p1s,v1s) = self.get_data_by_chr(chrom) # arrays for position and values
-            p1n = iter(p1s).__next__ # assign the next function to a viable to speed up
+            (p1s, v1s) = self.get_data_by_chr(chrom)  # arrays for position and values
+            # assign the next function to a variable to speed up
+            p1n = iter(p1s).__next__
             v1n = iter(v1s).__next__

-            (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values
-            p2n = iter(p2s).__next__ # assign the next function to a viable to speed up
+            # arrays for position and values
+            (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom)
+            # assign the next function to a variable to speed up
+            p2n = iter(p2s).__next__
             v2n = iter(v2s).__next__

-            pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret
+            # remember the previous position in the new bedGraphTrackI
+            # object ret
+            pre_p = 0
             try:
                 p1 = p1n()
                 v1 = v1n()
@@ -975,7 +1101,7 @@ def __init__ (self, float32_t baseline_value=0 ):
                 while True:
                     if p1 < p2:
                         # clip a region from pre_p to p1, then set pre_p as p1.
-                        if v2>0:
+                        if v2 > 0:
                             radd(str(chrom)+"."+str(pre_p)+"."+str(p1))
                             vadd(v1)
                             ladd(p1-pre_p)
@@ -984,8 +1110,9 @@ def __init__ (self, float32_t baseline_value=0 ):
                         p1 = p1n()
                         v1 = v1n()
                     elif p2 < p1:
-                        # clip a region from pre_p to p2, then set pre_p as p2.
-                        if v2>0:
+                        # clip a region from pre_p to p2, then set
+                        # pre_p as p2.
+                        if v2 > 0:
                             radd(str(chrom)+"."+str(pre_p)+"."+str(p2))
                             vadd(v1)
                             ladd(p2-pre_p)
@@ -995,7 +1122,7 @@ def __init__ (self, float32_t baseline_value=0 ):
                         v2 = v2n()
                     elif p1 == p2:
                         # from pre_p to p1 or p2, then set pre_p as p1 or p2.
-                        if v2>0:
+                        if v2 > 0:
                             radd(str(chrom)+"."+str(pre_p)+"."+str(p1))
                             vadd(v1)
                             ladd(p1-pre_p)
@@ -1011,7 +1138,8 @@ def __init__ (self, float32_t baseline_value=0 ):

         return ret

-    cpdef object extract_value_hmmr ( self, object bdgTrack2 ):
+    @cython.ccall
+    def extract_value_hmmr(self, bdgTrack2):
         """Extract values from regions defined in bedGraphTrackI class object
         `bdgTrack2`.
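Stepping back to the p2q method above: it is a Benjamini-Hochberg-style correction carried out in -log10 space. Each unique p-score v gets q = v + log10(k) - log10(N), i.e. -log10(p * N / k), where N is the total covered length and the rank k advances by the length covered at each p-score (the `k += l` step), clamped so q never increases as v decreases. A standalone sketch with toy numbers:

    from math import log10

    # -log10(p) score -> covered genome length (toy values)
    pvalue_stat = {5.0: 10, 3.0: 20, 1.0: 70}

    N = sum(pvalue_stat.values())    # total covered length
    k = 1                            # BH rank
    f = -log10(N)
    pre_q = 2147483647
    pqtable = {}
    for v in sorted(pvalue_stat, reverse=True):
        q = v + (log10(k) + f)       # -log10(p * N / k)
        q = max(0, min(pre_q, q))    # keep q monotonically non-increasing
        pqtable[v] = q
        pre_q = q
        k += pvalue_stat[v]          # rank advances by covered length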
@@ -1023,15 +1151,19 @@ def __init__ (self, float32_t baseline_value=0 ):
         'mark_bin' -- the bins in the same region will have the same value.
         """
-        cdef:
-            int32_t pre_p, p1, p2, i
-            float32_t v1, v2
-            bytes chrom
-            list ret
-
-        assert isinstance(bdgTrack2,bedGraphTrackI), "not a bedGraphTrackI object"
-
-        ret = [ [], pyarray('f',[]), pyarray('i',[]) ] # 0: bin location (chrom, position); 1: value; 2: number of bins in this region
+        # pre_p: cython.int
+        p1: cython.int
+        p2: cython.int
+        i: cython.int
+        v1: cython.float
+        v2: cython.float
+        chrom: bytes
+        ret: list
+
+        assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object"
+
+        # 0: bin location (chrom, position); 1: value; 2: number of bins in this region
+        ret = [[], pyarray('f', []), pyarray('i', [])]
         padd = ret[0].append
         vadd = ret[1].append
         ladd = ret[2].append

         chr1 = set(self.get_chr_names())
         chr2 = set(bdgTrack2.get_chr_names())
         common_chr = sorted(list(chr1.intersection(chr2)))
-        for i in range( len( common_chr ) ):
+        for i in range(len(common_chr)):
             chrom = common_chr.pop()
-            (p1s,v1s) = self.get_data_by_chr(chrom) # arrays for position and values
-            p1n = iter(p1s).__next__ # assign the next function to a viable to speed up
+            # arrays for position and values
+            (p1s, v1s) = self.get_data_by_chr(chrom)
+            # assign the next function to a variable to speed up
+            p1n = iter(p1s).__next__
             v1n = iter(v1s).__next__

-            (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values
-            p2n = iter(p2s).__next__ # assign the next function to a viable to speed up
+            # arrays for position and values
+            (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom)
+            # assign the next function to a variable to speed up
+            p2n = iter(p2s).__next__
             v2n = iter(v2s).__next__

-            pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret
+            # remember the previous position in the new bedGraphTrackI
+            # object ret
+            # pre_p = 0
             try:
                 p1 = p1n()
                 v1 = v1n()
@@ -1060,31 +1198,31 @@ def __init__ (self, float32_t baseline_value=0 ):
                     if p1 < p2:
                         # clip a region from pre_p to p1, then set pre_p as p1.
                         # in this case, we don't output any
-                        #if v2>0:
+                        # if v2>0:
                         #    radd(str(chrom)+"."+str(pre_p)+"."+str(p1))
                         #    vadd(v1)
                         #    ladd(p1-pre_p)
-                        pre_p = p1
+                        # pre_p = p1
                         # call for the next p1 and v1
                         p1 = p1n()
                         v1 = v1n()
                     elif p2 < p1:
                         # clip a region from pre_p to p2, then set pre_p as p2.
-                        if v2 != 0: #0 means it's a gap region, we should have value > 1
-                            padd( (chrom, p2) )
+                        if v2 != 0:  # 0 means it's a gap region, we should have value > 1
+                            padd((chrom, p2))
                             vadd(v1)
                             ladd(int(v2))
-                        pre_p = p2
+                        # pre_p = p2
                         # call for the next p2 and v2
                         p2 = p2n()
                         v2 = v2n()
                     elif p1 == p2:
                         # from pre_p to p1 or p2, then set pre_p as p1 or p2.
-                        if v2 != 0: #0 means it's a gap region, we should have 1 or -1
-                            padd( (chrom, p2) )
+                        if v2 != 0:  # 0 means it's a gap region, we should have 1 or -1
+                            padd((chrom, p2))
                             vadd(v1)
                             ladd(int(v2))
-                        pre_p = p1
+                        # pre_p = p1
                         # call for the next p1, v1, p2, v2.
                         p1 = p1n()
                         v1 = v1n()
@@ -1096,7 +1234,10 @@ def __init__ (self, float32_t baseline_value=0 ):

         return ret

-    cpdef make_ScoreTrackII_for_macs (self, object bdgTrack2, float32_t depth1 = 1.0, float32_t depth2 = 1.0 ):
+    @cython.ccall
+    def make_ScoreTrackII_for_macs(self, bdgTrack2,
+                                   depth1: float = 1.0,
+                                   depth2: float = 1.0):
         """A modified overlie function for MACS v2.
         effective_depth_in_million: sequencing depth in million after
@@ -1108,35 +1249,43 @@ def __init__ (self, float32_t baseline_value=0 ):

         Return value is a ScoreTrackII object.
         """
-        cdef:
-            int32_t pre_p, p1, p2
-            float32_t v1, v2
-            bytes chrom
-            object ret
+        # pre_p: cython.int
+        p1: cython.int
+        p2: cython.int
+        v1: cython.float
+        v2: cython.float
+        chrom: bytes

-        assert isinstance(bdgTrack2,bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object"
+        assert isinstance(bdgTrack2, bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object"

-        ret = ScoreTrackII( treat_depth = depth1, ctrl_depth = depth2 )
+        ret = ScoreTrackII(treat_depth=depth1,
+                           ctrl_depth=depth2)
         retadd = ret.add

         chr1 = set(self.get_chr_names())
         chr2 = set(bdgTrack2.get_chr_names())
         common_chr = chr1.intersection(chr2)
         for chrom in sorted(common_chr):
-
-            (p1s,v1s) = self.get_data_by_chr(chrom) # arrays for position and values
-            p1n = iter(p1s).__next__ # assign the next function to a viable to speed up
+            # arrays for position and values
+            (p1s, v1s) = self.get_data_by_chr(chrom)
+            # assign the next function to a variable to speed up
+            p1n = iter(p1s).__next__
             v1n = iter(v1s).__next__
-
-            (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values
-            p2n = iter(p2s).__next__ # assign the next function to a viable to speed up
+            # arrays for position and values
+            (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom)
+            # assign the next function to a variable to speed up
+            p2n = iter(p2s).__next__
             v2n = iter(v2s).__next__

-            chrom_max_len = len(p1s)+len(p2s) # this is the maximum number of locations needed to be recorded in scoreTrackI for this chromosome.
+            # this is the maximum number of locations needed to be
+            # recorded in scoreTrackI for this chromosome.
+            chrom_max_len = len(p1s)+len(p2s)

-            ret.add_chromosome(chrom,chrom_max_len)
+            ret.add_chromosome(chrom, chrom_max_len)

-            pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret
+            # remember the previous position in the new bedGraphTrackI
+            # object ret
+            # pre_p = 0

             try:
                 p1 = p1n()
@@ -1148,22 +1297,22 @@ def __init__ (self, float32_t baseline_value=0 ):
                 while True:
                     if p1 < p2:
                         # clip a region from pre_p to p1, then set pre_p as p1.
-                        retadd( chrom, p1, v1, v2 )
-                        pre_p = p1
+                        retadd(chrom, p1, v1, v2)
+                        # pre_p = p1
                         # call for the next p1 and v1
                         p1 = p1n()
                         v1 = v1n()
                     elif p2 < p1:
                         # clip a region from pre_p to p2, then set pre_p as p2.
-                        retadd( chrom, p2, v1, v2 )
-                        pre_p = p2
+                        retadd(chrom, p2, v1, v2)
+                        # pre_p = p2
                         # call for the next p2 and v2
                         p2 = p2n()
                         v2 = v2n()
                     elif p1 == p2:
                         # from pre_p to p1 or p2, then set pre_p as p1 or p2.
-                        retadd( chrom, p1, v1, v2 )
-                        pre_p = p1
+                        retadd(chrom, p1, v1, v2)
+                        # pre_p = p1
                         # call for the next p1, v1, p2, v2.
                         p1 = p1n()
                         v1 = v1n()
@@ -1174,13 +1323,19 @@ def __init__ (self, float32_t baseline_value=0 ):
                 pass

         ret.finalize()
-        #ret.merge_regions()
+        # ret.merge_regions()
         return ret

-    cpdef str cutoff_analysis ( self, int32_t max_gap, int32_t min_length, int32_t steps = 100, float32_t min_score = 0, float32_t max_score = 1000 ):
+    @cython.ccall
+    def cutoff_analysis(self,
+                        max_gap: cython.int,
+                        min_length: cython.int,
+                        steps: cython.int = 100,
+                        min_score: cython.float = 0,
+                        max_score: cython.float = 1000) -> str:
         """
         Cutoff analysis function for bedGraphTrackI object.
-
+
         This function will try all possible cutoff values on the score
         column to call peaks.
         It will then report a number of metrics (number of peaks,
         total length of peaks, average
@@ -1196,7 +1351,7 @@ def __init__ (self, float32_t baseline_value=0 ):
         max_gap : int32_t
             Maximum allowed gap between consecutive positions above cutoff
-
+
         min_length : int32_t
             Minimum length of peak
         steps: int32_t
             It will be used to calculate 'step' to increase from min_v to
@@ -1228,47 +1383,61 @@ def __init__ (self, float32_t baseline_value=0 ):
         can add more ways to analyze the result. Also, we can let this
         function return a list of dictionary or data.frame in that
         way, instead of str object.
-
         """
-        cdef:
-            set chrs
-            list peak_content, ret_list, cutoff_list, cutoff_npeaks, cutoff_lpeaks
-            bytes chrom
-            str ret
-            float32_t cutoff
-            int64_t total_l, total_p, i, n, ts, te, lastp, tl, peak_length
-            #dict cutoff_npeaks, cutoff_lpeaks
-            float32_t s, midvalue
+        chrs: set
+        peak_content: list
+        ret_list: list
+        cutoff_list: list
+        cutoff_npeaks: list
+        cutoff_lpeaks: list
+        chrom: bytes
+        ret: str
+        cutoff: cython.float
+        total_l: cython.long
+        total_p: cython.long
+        i: cython.long
+        n: cython.long
+        ts: cython.long
+        te: cython.long
+        lastp: cython.long
+        tl: cython.long
+        peak_length: cython.long
+        # dict cutoff_npeaks, cutoff_lpeaks
+        s: cython.float

         chrs = self.get_chr_names()

-        #midvalue = self.minvalue/2 + self.maxvalue/2
-        #s = float(self.minvalue - midvalue)/steps
-        minv = max( min_score, self.minvalue )
-        maxv = min( self.maxvalue, max_score )
+        # midvalue = self.minvalue/2 + self.maxvalue/2
+        # s = float(self.minvalue - midvalue)/steps
+        minv = max(min_score, self.minvalue)
+        maxv = min(self.maxvalue, max_score)

         s = float(maxv - minv)/steps

         # a list of possible cutoff values from minv to maxv with step of s
         cutoff_list = [round(value, 3) for value in np.arange(minv, maxv, s)]
-        cutoff_npeaks = [0] * len( cutoff_list )
-        cutoff_lpeaks = [0] * len( cutoff_list )
+        cutoff_npeaks = [0] * len(cutoff_list)
+        cutoff_lpeaks = [0] * len(cutoff_list)

         for chrom in sorted(chrs):
-            ( pos_array, score_array ) = self.__data[ chrom ]
-            pos_array = np.array( self.__data[ chrom ][ 0 ] )
-            score_array = np.array( self.__data[ chrom ][ 1 ] )
+            (pos_array, score_array) = self.__data[chrom]
+            pos_array = np.array(self.__data[chrom][0])
+            score_array = np.array(self.__data[chrom][1])

-            for n in range( len( cutoff_list ) ):
-                cutoff = cutoff_list[ n ]
+            for n in range(len(cutoff_list)):
+                cutoff = cutoff_list[n]
                 total_l = 0           # total length of peaks
                 total_p = 0           # total number of peaks

-                # get the regions with scores above cutoffs
-                above_cutoff = np.nonzero( score_array > cutoff )[0]# this is not an optimized method. It would be better to store score array in a 2-D ndarray?
-                above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
-                above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff
+                # get the regions with scores above cutoffs. This is
+                # not an optimized method. It would be better to store
+                # score array in a 2-D ndarray?
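To make the indexing step that follows concrete, here is a standalone sketch with toy arrays (it mirrors the logic below; the values are made up):

    import numpy as np

    # toy stand-ins for one chromosome's position/score arrays
    pos_array = np.array([100, 200, 300, 400], dtype="i4")    # region end positions
    score_array = np.array([0.5, 2.0, 3.0, 0.1], dtype="f4")  # region scores

    cutoff = 1.0
    above_cutoff = np.nonzero(score_array > cutoff)[0]        # -> [1, 2]
    # region i ends at pos_array[i] and starts at pos_array[i-1];
    # note that index -1 wraps to the last element, so this trick
    # assumes the very first region does not pass the cutoff
    starts = pos_array[above_cutoff - 1]                      # -> [100, 200]
    ends = pos_array[above_cutoff]                            # -> [200, 300]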
+                above_cutoff = np.nonzero(score_array > cutoff)[0]
+                # end positions of regions where score is above cutoff
+                above_cutoff_endpos = pos_array[above_cutoff]
+                # start positions of regions where score is above cutoff
+                above_cutoff_startpos = pos_array[above_cutoff-1]

                 if above_cutoff_endpos.size == 0:
                     continue
@@ -1279,62 +1448,66 @@ def __init__ (self, float32_t baseline_value=0 ):
                 ts = acs_next()
                 te = ace_next()
-                peak_content = [( ts, te ), ]
+                peak_content = [(ts, te),]
                 lastp = te

-                for i in range( 1, above_cutoff_startpos.size ):
+                for i in range(1, above_cutoff_startpos.size):
                     ts = acs_next()
                     te = ace_next()
                     tl = ts - lastp
                     if tl <= max_gap:
-                        peak_content.append( ( ts, te ) )
+                        peak_content.append((ts, te))
                     else:
-                        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
-                        if peak_length >= min_length: # if the peak is too small, reject it
-                            total_l += peak_length
+                        peak_length = peak_content[-1][1] - peak_content[0][0]
+                        # if the peak is too small, reject it
+                        if peak_length >= min_length:
+                            total_l += peak_length
                             total_p += 1
-                        peak_content = [ ( ts, te ), ]
+                        peak_content = [(ts, te),]
                     lastp = te

                 if peak_content:
-                    peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
-                    if peak_length >= min_length: # if the peak is too small, reject it
-                        total_l += peak_length
+                    peak_length = peak_content[-1][1] - peak_content[0][0]
+                    # if the peak is too small, reject it
+                    if peak_length >= min_length:
+                        total_l += peak_length
                         total_p += 1
-                cutoff_lpeaks[ n ] += total_l
-                cutoff_npeaks[ n ] += total_p
-
+                cutoff_lpeaks[n] += total_l
+                cutoff_npeaks[n] += total_p
+
         # prepare the returning text
         ret_list = ["score\tnpeaks\tlpeaks\tavelpeak\n"]
-        for n in range( len( cutoff_list )-1, -1, -1 ):
-            cutoff = cutoff_list[ n ]
-            if cutoff_npeaks[ n ] > 0:
-                ret_list.append("%.2f\t%d\t%d\t%.2f\n" % ( cutoff, cutoff_npeaks[ n ], \
-                                                           cutoff_lpeaks[ n ], \
-                                                           cutoff_lpeaks[ n ]/cutoff_npeaks[ n ] ))
+        for n in range(len(cutoff_list)-1, -1, -1):
+            cutoff = cutoff_list[n]
+            if cutoff_npeaks[n] > 0:
+                ret_list.append("%.2f\t%d\t%d\t%.2f\n" % (cutoff,
+                                                          cutoff_npeaks[n],
+                                                          cutoff_lpeaks[n],
+                                                          cutoff_lpeaks[n]/cutoff_npeaks[n]))
         ret = ''.join(ret_list)
         return ret

-
-cdef np.ndarray calculate_elbows( np.ndarray values, float32_t threshold=0.01):
-    # although this function is supposed to find elbow pts for cutoff analysis,
-    # however, in reality, it barely works...
-    cdef:
-        np.ndarray deltas, slopes, delta_slopes, elbows
-        np.float32_t avg_delta_slope
-
+
+@cython.cfunc
+def calculate_elbows(values: cnp.ndarray,
+                     threshold: cython.float = 0.01) -> cnp.ndarray:
+    # although this function is supposed to find elbow points for
+    # cutoff analysis, in reality it barely works...
+    deltas: cnp.ndarray
+    slopes: cnp.ndarray
+    delta_slopes: cnp.ndarray
+    elbows: cnp.ndarray
+    avg_delta_slope: cython.float
+
     # Calculate the difference between each point and the first point
     deltas = values - values[0]
-
     # Calculate the slope between each point and the last point
     slopes = deltas / (values[-1] - values[0])
-
     # Calculate the change in slope
     delta_slopes = np.diff(slopes)
-
     # Calculate the average change in slope
     avg_delta_slope = np.mean(delta_slopes)
-
-    # Find all points where the change in slope is significantly larger than the average
+    # Find all points where the change in slope is significantly
+    # larger than the average
     elbows = np.where(delta_slopes > avg_delta_slope + threshold)[0]
-
     return elbows
diff --git a/MACS3/Signal/HMMR_Signal_Processing.py b/MACS3/Signal/HMMR_Signal_Processing.py
index b6185663..a6c4a8cb 100644
--- a/MACS3/Signal/HMMR_Signal_Processing.py
+++ b/MACS3/Signal/HMMR_Signal_Processing.py
@@ -1,6 +1,6 @@
 # cython: language_level=3
 # cython: profile=True
-# Time-stamp: <2024-10-04 10:25:29 Tao Liu>
+# Time-stamp: <2024-10-14 17:04:27 Tao Liu>

 """Module description:

@@ -137,7 +137,7 @@ def generate_digested_signals(petrack, weight_mapping: list) -> list:
         certain_signals = ret_digested_signals[i]
         bdg = bedGraphTrackI()
         for chrom in sorted(certain_signals.keys()):
-            bdg.add_chrom_data_hmmr_PV(chrom, certain_signals[chrom])
+            bdg.add_chrom_data_PV(chrom, certain_signals[chrom])
         ret_bedgraphs.append(bdg)
     return ret_bedgraphs

diff --git a/MACS3/Signal/PairedEndTrack.py b/MACS3/Signal/PairedEndTrack.py
index ed056a9b..d8a7a85f 100644
--- a/MACS3/Signal/PairedEndTrack.py
+++ b/MACS3/Signal/PairedEndTrack.py
@@ -1,6 +1,6 @@
 # cython: language_level=3
 # cython: profile=True
-# Time-stamp: <2024-10-14 13:15:32 Tao Liu>
+# Time-stamp: <2024-10-14 21:13:35 Tao Liu>

 """Module for filter duplicate tags from paired-end data

@@ -24,7 +24,8 @@
                                  over_two_pv_array,
                                  se_all_in_one_pileup)
 from MACS3.Signal.BedGraph import bedGraphTrackI
-from MACS3.Signal.PileupV2 import pileup_from_LR_hmmratac
+from MACS3.Signal.PileupV2 import (pileup_from_LR_hmmratac,
+                                   pileup_from_LRC)
 # ------------------------------------
 # Other modules
 # ------------------------------------
@@ -56,7 +57,7 @@ class PETrackI:
     locations = cython.declare(dict, visibility="public")
     size = cython.declare(dict, visibility="public")
     buf_size = cython.declare(dict, visibility="public")
-    sorted = cython.declare(bool, visibility="public")
+    is_sorted = cython.declare(bool, visibility="public")
     total = cython.declare(cython.ulong, visibility="public")
     annotation = cython.declare(str, visibility="public")
     # rlengths: reference chromosome lengths dictionary
@@ -64,27 +65,28 @@ class PETrackI:
     buffer_size = cython.declare(cython.long, visibility="public")
     length = cython.declare(cython.long, visibility="public")
     average_template_length = cython.declare(cython.float, visibility="public")
-    destroyed: bool
+    is_destroyed: bool

     def __init__(self, anno: str = "", buffer_size: cython.long = 100000):
         """fw is the fixed-width for all locations.
""" # dictionary with chrname as key, nparray with - # [('l','int32'),('r','int32')] as value + # [('l','i4'),('r','i4')] as value self.locations = {} # dictionary with chrname as key, size of the above nparray as value # size is to remember the size of the fragments added to this chromosome self.size = {} # dictionary with chrname as key, size of the above nparray as value self.buf_size = {} - self.sorted = False + self.is_sorted = False self.total = 0 # total fragments self.annotation = anno # need to be figured out self.rlengths = {} self.buffer_size = buffer_size self.length = 0 self.average_template_length = 0.0 + self.is_destroyed = False @cython.ccall def add_loc(self, chromosome: bytes, @@ -130,7 +132,7 @@ def destroy(self): refcheck=False) self.locations[chromosome] = None self.locations.pop(chromosome) - self.destroyed = True + self.is_destroyed = True return @cython.ccall @@ -187,7 +189,7 @@ def finalize(self): self.locations[c].sort(order=['l', 'r']) self.total += self.size[c] - self.sorted = True + self.is_sorted = True self.average_template_length = cython.cast(cython.float, self.length) / self.total return @@ -219,7 +221,7 @@ def sort(self): for c in chrnames: self.locations[c].sort(order=['l', 'r']) # sort by the leftmost location - self.sorted = True + self.is_sorted = True return @cython.ccall @@ -286,7 +288,7 @@ def filter_dup(self, maxnum: cython.int = -1): if maxnum < 0: return # condition to return if not filtering - if not self.sorted: + if not self.is_sorted: self.sort() self.total = 0 @@ -539,7 +541,7 @@ def pileup_a_chromosome_c(self, three_shift: cython.long rlength: cython.long = self.get_rlengths()[chrom] - if not self.sorted: + if not self.is_sorted: self.sort() assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!" @@ -619,10 +621,12 @@ def pileup_bdg(self, def pileup_bdg_hmmr(self, mapping: list, baseline_value: cython.float = 0.0) -> list: - """pileup all chromosomes, and return a list of four - bedGraphTrackI objects: short, mono, di, and tri nucleosomal - signals. + """pileup all chromosomes, and return a list of four p-v + ndarray objects: short, mono, di, and tri nucleosomal signals. + This is specifically designed for hmmratac + HMM_SignalProcessing.py. Not a general function. + The idea is that for each fragment length, we generate four bdg using four weights from four distributions. Then we add all sets of four bdgs together. @@ -650,40 +654,66 @@ def pileup_bdg_hmmr(self, @cython.cclass -class PETrackII(PETrackI): - """Documentation for PEtrac +class PETrackII: + """Paired-end track class for fragment files from single-cell + ATAC-seq experiments. We will store data of start, end, barcode, + and count from the fragment files. + + * I choose not to inherit PETrackI because there would be a lot of + differences. 
""" + locations = cython.declare(dict, visibility="public") # add another dict for storing barcode for each fragment we will # first convert barcode into integer and remember them in the - # barcode_dict, which will map key:barcode -> value:integer + # barcode_dict, which will map the rule to numerize + # key:bytes as value:4bytes_integer barcodes = cython.declare(dict, visibility="public") barcode_dict = cython.declare(dict, visibility="public") - # the last number for barcodes, used to map barcode into integer + # the last number for barcodes, used to map barcode to integer barcode_last_n: cython.int - def __init__(self): - super().__init__() + size = cython.declare(dict, visibility="public") + buf_size = cython.declare(dict, visibility="public") + is_sorted = cython.declare(bool, visibility="public") + total = cython.declare(cython.ulong, visibility="public") + annotation = cython.declare(str, visibility="public") + # rlengths: reference chromosome lengths dictionary + rlengths = cython.declare(dict, visibility="public") + buffer_size = cython.declare(cython.long, visibility="public") + length = cython.declare(cython.long, visibility="public") + average_template_length = cython.declare(cython.float, visibility="public") + is_destroyed: bool + + def __init__(self, anno: str = "", buffer_size: cython.long = 100000): + # dictionary with chrname as key, nparray with + # [('l','i4'),('r','i4'),('c','u1')] as value + self.locations = {} + # dictionary with chrname as key, size of the above nparray as value + # size is to remember the size of the fragments added to this chromosome + self.size = {} + # dictionary with chrname as key, size of the above nparray as value + self.buf_size = {} + self.is_sorted = False + self.total = 0 # total fragments + self.annotation = anno # need to be figured out + self.rlengths = {} + self.buffer_size = buffer_size + self.length = 0 + self.average_template_length = 0.0 + self.is_destroyed = False + self.barcodes = {} self.barcode_dict = {} self.barcode_last_n = 0 @cython.ccall - def add_loc(self, chromosome: bytes, - start: cython.int, end: cython.int): - raise NotImplementedError("This function is disabled PETrackII") - - @cython.ccall - def filter_dup(self, maxnum: cython.int = -1): - raise NotImplementedError("This function is disabled PETrackII") - - @cython.ccall - def add_frag(self, - chromosome: bytes, - start: cython.int, - end: cython.int, - barcode: bytes, - count: cython.uchar): + def add_loc(self, + chromosome: bytes, + start: cython.int, + end: cython.int, + barcode: bytes, + count: cython.uchar): """Add a location to the list according to the sequence name. chromosome: mostly the chromosome name @@ -747,9 +777,42 @@ def destroy(self): self.barcodes[chromosome] = None self.barcodes.pop(chromosome) self.barcode_dict = {} - self.destroyed = True + self.is_destroyed = True return + @cython.ccall + def set_rlengths(self, rlengths: dict) -> bool: + """Set reference chromosome lengths dictionary. + + Only the chromosome existing in this petrack object will be updated. + + If a chromosome in this petrack is not covered by given + rlengths, and it has no associated length, it will be set as + maximum integer. 
+ """ + valid_chroms: set + missed_chroms: set + chrom: bytes + + valid_chroms = set(self.locations.keys()).intersection(rlengths.keys()) + for chrom in sorted(valid_chroms): + self.rlengths[chrom] = rlengths[chrom] + missed_chroms = set(self.locations.keys()).difference(rlengths.keys()) + for chrom in sorted(missed_chroms): + self.rlengths[chrom] = INT_MAX + return True + + @cython.ccall + def get_rlengths(self) -> dict: + """Get reference chromosome lengths dictionary. + + If self.rlengths is empty, create a new dict where the length of + chromosome will be set as the maximum integer. + """ + if not self.rlengths: + self.rlengths = dict([(k, INT_MAX) for k in self.locations.keys()]) + return self.rlengths + @cython.ccall def finalize(self): """Resize np arrays for 5' positions and sort them in place @@ -774,11 +837,29 @@ def finalize(self): self.barcodes[c] = self.barcodes[c][indices] self.total += np.sum(self.locations[c]['c']) # self.size[c] - self.sorted = True + self.is_sorted = True self.average_template_length = cython.cast(cython.float, self.length) / self.total return + @cython.ccall + def get_locations_by_chr(self, chromosome: bytes): + """Return a np array of left/right/count for certain chromosome. + + """ + if chromosome in self.locations: + return self.locations[chromosome] + else: + raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome)) + + @cython.ccall + def get_chr_names(self) -> set: + """Return all the chromosome names in this track object as a + python set. + + """ + return set(self.locations.keys()) + @cython.ccall def sort(self): """Naive sorting for locations. @@ -794,7 +875,7 @@ def sort(self): indices = np.argsort(self.locations[c], order=['l', 'r']) self.locations[c] = self.locations[c][indices] self.barcodes[c] = self.barcodes[c][indices] - self.sorted = True + self.is_sorted = True return @cython.ccall @@ -849,7 +930,7 @@ def subset(self, selected_barcodes: set): """Make a subset of PETrackII with only the given barcodes. Note: the selected_barcodes is a set of barcodes in python - bytes. For example, b"ATCTGCTAGTCTACAT" + bytes. 
For example, {b"ATCTGCTAGTCTACAT", b"ATTCTCGATGCAGTCA"} """ indices: cnp.ndarray @@ -875,23 +956,20 @@ def subset(self, selected_barcodes: set): # pass some values from self to ret ret.annotation = self.annotation - ret.sorted = self.sorted + ret.is_sorted = self.is_sorted ret.rlengths = self.rlengths ret.buffer_size = self.buffer_size ret.total = 0 ret.length = 0 ret.average_template_length = 0 - ret.destroyed = True + ret.is_destroyed = True for chromosome in sorted(chrs): - indices = np.where(np.isin(self.barcodes[chromosome], list(selected_barcodes_n)))[0] - print(chrs, indices) + indices = np.where(np.isin(self.barcodes[chromosome], + list(selected_barcodes_n)))[0] ret.barcodes[chromosome] = self.barcodes[chromosome][indices] - print(ret.barcodes) ret.locations[chromosome] = self.locations[chromosome][indices] - print(ret.locations) ret.size[chromosome] = len(ret.locations[chromosome]) - print(ret.size) ret.buf_size[chromosome] = ret.size[chromosome] ret.total += np.sum(ret.locations[chromosome]['c']) ret.length += np.sum((ret.locations[chromosome]['r'] - @@ -900,58 +978,24 @@ def subset(self, selected_barcodes: set): ret.average_template_length = ret.length / ret.total return ret - @cython.ccall - def sample_percent(self, percent: cython.float, seed: cython.int = -1): - raise NotImplementedError("This function is disabled PETrackII") - - @cython.ccall - def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1): - raise NotImplementedError("This function is disabled PETrackII") - - @cython.ccall - def sample_num(self, samplesize: cython.ulong, seed: cython.int = -1): - raise NotImplementedError("This function is disabled PETrackII") - - @cython.ccall - def sample_num_copy(self, samplesize: cython.ulong, seed: cython.int = -1): - raise NotImplementedError("This function is disabled PETrackII") - @cython.ccall def pileup_a_chromosome(self, - chrom: bytes, - scale_factor_s: list, - baseline_value: cython.float = 0.0) -> list: - """pileup a certain chromosome, return [p,v] (end position and - pileup value) list. - - scale_factor_s : linearly scale the pileup value applied to - each d in ds. The list should have the same - length as ds. - - baseline_value : a value to be filled for missing values, and - will be the minimum pileup. - + chrom: bytes) -> cnp.ndarray: + """pileup a certain chromosome, return p-v ndarray (end + position and pileup value). """ - tmp_pileup: list - prev_pileup: list - scale_factor: cython.float - - prev_pileup = None - - for i in range(len(scale_factor_s)): - scale_factor = scale_factor_s[i] + return pileup_from_LRC(self.locations[chrom]) - # Can't directly pass partial nparray there since that will mess up with pointer calculation. - tmp_pileup = quick_pileup(np.sort(self.locations[chrom]['l']), - np.sort(self.locations[chrom]['r']), - scale_factor, baseline_value) - - if prev_pileup: - prev_pileup = over_two_pv_array(prev_pileup, - tmp_pileup, - func="max") - else: - prev_pileup = tmp_pileup + @cython.ccall + def pileup_bdg(self): + """Pileup all chromosome and return a bdg object. 
+ """ + bdg: bedGraphTrackI + pv: cnp.ndarray - return prev_pileup + bdg = bedGraphTrackI() + for chrom in self.get_chr_names(): + pv = pileup_from_LRC(self.locations[chrom]) + bdg.add_chrom_data_PV(chrom, pv) + return bdg diff --git a/MACS3/Signal/Pileup.py b/MACS3/Signal/Pileup.py index fcff7ce7..dbb674fe 100644 --- a/MACS3/Signal/Pileup.py +++ b/MACS3/Signal/Pileup.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-10-14 13:42:18 Tao Liu> +# Time-stamp: <2024-10-14 20:08:53 Tao Liu> """Module Description: For pileup functions. @@ -24,7 +24,6 @@ # ------------------------------------ import numpy as np import cython.cimports.numpy as cnp -from cython.cimports.numpy import int32_t, float32_t from cython.cimports.cpython import bool # ------------------------------------ @@ -60,7 +59,7 @@ def fix_coordinates(poss: cnp.ndarray, rlength: cython.int) -> cnp.ndarray: """Fix the coordinates. """ i: cython.long - ptr: cython.pointer(int32_t) = cython.cast(cython.pointer(int32_t), poss.data) # pointer + ptr: cython.pointer(cython.int) = cython.cast(cython.pointer(cython.int), poss.data) # pointer #ptr = poss.data @@ -276,10 +275,10 @@ def se_all_in_one_pileup(plus_tags: cnp.ndarray, ret_v: cnp.ndarray # pointers are used for numpy arrays - start_poss_ptr: cython.pointer(int32_t) - end_poss_ptr: cython.pointer(int32_t) - ret_p_ptr: cython.pointer(int32_t) - ret_v_ptr: cython.pointer(float32_t) + start_poss_ptr: cython.pointer(cython.int) + end_poss_ptr: cython.pointer(cython.int) + ret_p_ptr: cython.pointer(cython.int) + ret_v_ptr: cython.pointer(cython.float) start_poss = np.concatenate((plus_tags-five_shift, minus_tags-three_shift)) end_poss = np.concatenate((plus_tags+three_shift, minus_tags+five_shift)) @@ -294,14 +293,14 @@ def se_all_in_one_pileup(plus_tags: cnp.ndarray, lx = start_poss.shape[0] - start_poss_ptr = cython.cast(cython.pointer(int32_t), start_poss.data) # start_poss.data - end_poss_ptr = cython.cast(cython.pointer(int32_t), end_poss.data) # end_poss.data + start_poss_ptr = cython.cast(cython.pointer(cython.int), start_poss.data) # start_poss.data + end_poss_ptr = cython.cast(cython.pointer(cython.int), end_poss.data) # end_poss.data ret_p = np.zeros(2 * lx, dtype="i4") ret_v = np.zeros(2 * lx, dtype="f4") - ret_p_ptr = cython.cast(cython.pointer(int32_t), ret_p.data) - ret_v_ptr = cython.cast(cython.pointer(float32_t), ret_v.data) + ret_p_ptr = cython.cast(cython.pointer(cython.int), ret_p.data) + ret_v_ptr = cython.cast(cython.pointer(cython.float), ret_v.data) tmp = [ret_p, ret_v] # for (endpos,value) @@ -421,19 +420,19 @@ def quick_pileup(start_poss: cnp.ndarray, tmp: list # pointers are used for numpy arrays - start_poss_ptr: cython.pointer(int32_t) - end_poss_ptr: cython.pointer(int32_t) - ret_p_ptr: cython.pointer(int32_t) - ret_v_ptr: cython.pointer(float32_t) + start_poss_ptr: cython.pointer(cython.int) + end_poss_ptr: cython.pointer(cython.int) + ret_p_ptr: cython.pointer(cython.int) + ret_v_ptr: cython.pointer(cython.float) - start_poss_ptr = cython.cast(cython.pointer(int32_t), start_poss.data) # start_poss.data - end_poss_ptr = cython.cast(cython.pointer(int32_t), end_poss.data) # end_poss.data + start_poss_ptr = cython.cast(cython.pointer(cython.int), start_poss.data) # start_poss.data + end_poss_ptr = cython.cast(cython.pointer(cython.int), end_poss.data) # end_poss.data ret_p = np.zeros(l, dtype="i4") ret_v = np.zeros(l, dtype="f4") - ret_p_ptr = cython.cast(cython.pointer(int32_t), ret_p.data) - ret_v_ptr = 
cython.cast(cython.pointer(float32_t), ret_v.data) + ret_p_ptr = cython.cast(cython.pointer(cython.int), ret_p.data) + ret_v_ptr = cython.cast(cython.pointer(cython.float), ret_v.data) tmp = [ret_p, ret_v] # for (endpos,value) @@ -530,23 +529,23 @@ def naive_quick_pileup(sorted_poss: cnp.ndarray, extension: int) -> list: ret_v: cnp.ndarray # pointers are used for numpy arrays - start_poss_ptr: cython.pointer(int32_t) - end_poss_ptr: cython.pointer(int32_t) - ret_p_ptr: cython.pointer(int32_t) - ret_v_ptr: cython.pointer(float32_t) + start_poss_ptr: cython.pointer(cython.int) + end_poss_ptr: cython.pointer(cython.int) + ret_p_ptr: cython.pointer(cython.int) + ret_v_ptr: cython.pointer(cython.float) start_poss = sorted_poss - extension start_poss[start_poss < 0] = 0 end_poss = sorted_poss + extension - start_poss_ptr = cython.cast(cython.pointer(int32_t), start_poss.data) # start_poss.data - end_poss_ptr = cython.cast(cython.pointer(int32_t), end_poss.data) # end_poss.data + start_poss_ptr = cython.cast(cython.pointer(cython.int), start_poss.data) # start_poss.data + end_poss_ptr = cython.cast(cython.pointer(cython.int), end_poss.data) # end_poss.data ret_p = np.zeros(2*l, dtype="i4") ret_v = np.zeros(2*l, dtype="f4") - ret_p_ptr = cython.cast(cython.pointer(int32_t), ret_p.data) - ret_v_ptr = cython.cast(cython.pointer(float32_t), ret_v.data) + ret_p_ptr = cython.cast(cython.pointer(cython.int), ret_p.data) + ret_v_ptr = cython.cast(cython.pointer(cython.float), ret_v.data) if l == 0: raise Exception("length is 0") @@ -627,7 +626,7 @@ def over_two_pv_array(pv_array1: list, pv_array2: list, func: str = "max") -> li available operations are 'max', 'min', and 'mean' """ - #pre_p: cython.int + # pre_p: cython.int l1: cython.long l2: cython.long @@ -643,12 +642,12 @@ def over_two_pv_array(pv_array1: list, pv_array2: list, func: str = "max") -> li ret_v: cnp.ndarray # pointers are used for numpy arrays - a1_pos_ptr: cython.pointer(int32_t) - a2_pos_ptr: cython.pointer(int32_t) - ret_pos_ptr: cython.pointer(int32_t) - a1_v_ptr: cython.pointer(float32_t) - a2_v_ptr: cython.pointer(float32_t) - ret_v_ptr: cython.pointer(float32_t) + a1_pos_ptr: cython.pointer(cython.int) + a2_pos_ptr: cython.pointer(cython.int) + ret_pos_ptr: cython.pointer(cython.int) + a1_v_ptr: cython.pointer(cython.float) + a2_v_ptr: cython.pointer(cython.float) + ret_v_ptr: cython.pointer(cython.float) if func == "max": f = max @@ -661,15 +660,15 @@ def over_two_pv_array(pv_array1: list, pv_array2: list, func: str = "max") -> li [a1_pos, a1_v] = pv_array1 [a2_pos, a2_v] = pv_array2 - ret_pos = np.zeros(a1_pos.shape[0] + a2_pos.shape[0], dtype="int32") - ret_v = np.zeros(a1_pos.shape[0] + a2_pos.shape[0], dtype="float32") - - a1_pos_ptr = cython.cast(cython.pointer(int32_t), a1_pos.data) - a1_v_ptr = cython.cast(cython.pointer(float32_t), a1_v.data) - a2_pos_ptr = cython.cast(cython.pointer(int32_t), a2_pos.data) - a2_v_ptr = cython.cast(cython.pointer(float32_t), a2_v.data) - ret_pos_ptr = cython.cast(cython.pointer(int32_t), ret_pos.data) - ret_v_ptr = cython.cast(cython.pointer(float32_t), ret_v.data) + ret_pos = np.zeros(a1_pos.shape[0] + a2_pos.shape[0], dtype="i4") + ret_v = np.zeros(a1_pos.shape[0] + a2_pos.shape[0], dtype="f4") + + a1_pos_ptr = cython.cast(cython.pointer(cython.int), a1_pos.data) + a1_v_ptr = cython.cast(cython.pointer(cython.float), a1_v.data) + a2_pos_ptr = cython.cast(cython.pointer(cython.int), a2_pos.data) + a2_v_ptr = cython.cast(cython.pointer(cython.float), a2_v.data) + ret_pos_ptr = 
cython.cast(cython.pointer(cython.int), ret_pos.data) + ret_v_ptr = cython.cast(cython.pointer(cython.float), ret_v.data) l1 = a1_pos.shape[0] l2 = a2_pos.shape[0] diff --git a/MACS3/Signal/PileupV2.py b/MACS3/Signal/PileupV2.py index 62f065f4..299ecf6e 100644 --- a/MACS3/Signal/PileupV2.py +++ b/MACS3/Signal/PileupV2.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-10-04 23:59:48 Tao Liu> +# Time-stamp: <2024-10-14 21:19:00 Tao Liu> """Module Description: @@ -34,33 +34,19 @@ for i from 0 to 2N in PV_sorted: 1: z = z + v_i 2: e = p_i - 3: save the pileup from position s to e is z -- in bedGraph style is to only save (e, z) + 3: save the pileup from position s to e is z, + in bedGraph style is to only save (e, z) 4: s = e This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file LICENSE included with the distribution). - """ - -# ------------------------------------ -# python modules -# ------------------------------------ - -# ------------------------------------ -# MACS3 modules -# ------------------------------------ - -# ------------------------------------ -# Other modules # ------------------------------------ import numpy as np import cython import cython.cimports.numpy as cnp -from cython.cimports.numpy import int32_t, float32_t, uint64_t -# ------------------------------------ -# C lib # ------------------------------------ # from cython.cimports.libc.stdlib import malloc, free, qsort @@ -70,7 +56,7 @@ @cython.ccall -def mapping_function_always_1(L: int32_t, R: int32_t) -> float32_t: +def mapping_function_always_1(L: cython.int, R: cython.int) -> cython.float: # always return 1, useful while the weight is already 1, or in # case of simply piling up fragments for coverage. return 1.0 @@ -86,81 +72,6 @@ def clean_up_ndarray(x: cnp.ndarray): x.resize(0, refcheck=False) return -# ------------------------------------ -# public python functions -# ------------------------------------ - - -@cython.ccall -def pileup_from_LR_hmmratac(LR_array: cnp.ndarray, - mapping_dict: dict) -> cnp.ndarray: - # this function is specifically designed for piling up fragments - # for `hmmratac`. - # - # As for `hmmratac`, the weight depends on the length of the - # fragment, aka, value of R-L. Therefore, we need a mapping_dict - # for mapping length to weight. - l_LR: uint64_t - l_PV: uint64_t - i: uint64_t - L: int32_t - R: int32_t - PV: cnp.ndarray - pileup: cnp.ndarray - - l_LR = LR_array.shape[0] - l_PV = 2 * l_LR - PV = np.zeros(shape=l_PV, dtype=[('p', 'uint32'), ('v', 'float32')]) - for i in range(l_LR): - (L, R) = LR_array[i] - PV[i*2] = (L, mapping_dict[R - L]) - PV[i*2 + 1] = (R, -1 * mapping_dict[R - L]) - PV.sort(order='p') - pileup = pileup_PV(PV) - clean_up_ndarray(PV) - return pileup - - -@cython.ccall -def pileup_from_LR(LR_array: cnp.ndarray, - mapping_func=mapping_function_always_1) -> cnp.ndarray: - """This function will pile up the ndarray containing left and - right positions, which is typically from PETrackI object. It's - useful when generating the pileup of a single chromosome is - needed. - - User needs to provide a numpy array of left and right positions, - with dtype=[('l','int32'),('r','int32')]. User also needs to - provide a mapping function to map the left and right position to - certain weight. 
-
-    """
-    PV_array: cnp.ndarray
-    pileup: cnp.ndarray
-
-    PV_array = make_PV_from_LR(LR_array, mapping_func=mapping_func)
-    pileup = pileup_PV(PV_array)
-    clean_up_ndarray(PV_array)
-    return pileup
-
-
-@cython.ccall
-def pileup_from_PN(P_array: cnp.ndarray, N_array: cnp.ndarray,
-                   extsize: cython.int) -> cnp.ndarray:
-    """This function will pile up the ndarray containing plus
-    (positive) and minus (negative) positions of all reads, which is
-    typically from FWTrackI object. It's useful when generating the
-    pileup of a single chromosome is needed.
-
-    """
-    PV_array: cnp.ndarray
-    pileup: cnp.ndarray
-
-    PV_array = make_PV_from_PN(P_array, N_array, extsize)
-    pileup = pileup_PV(PV_array)
-    clean_up_ndarray(PV_array)
-    return pileup
-
 
 @cython.cfunc
 def make_PV_from_LR(LR_array: cnp.ndarray,
@@ -170,16 +81,16 @@ def make_PV_from_LR(LR_array: cnp.ndarray,
     `mapping_func( L, R )` or simply 1 if mapping_func is the default.
 
     LR array is an np.ndarray as with dtype
-    [('l','int32'),('r','int32')] with length of N
+    [('l','i4'),('r','i4')] with length of N
 
     PV array is an np.ndarray with
-    dtype=[('p','uint32'),('v','float32')] with length of 2N
+    dtype=[('p','u4'),('v','f4')] with length of 2N
     """
-    l_LR: uint64_t
-    l_PV: uint64_t
-    i: uint64_t
-    L: int32_t
-    R: int32_t
+    l_LR: cython.ulong
+    l_PV: cython.ulong
+    i: cython.ulong
+    L: cython.int
+    R: cython.int
     PV: cnp.ndarray
 
     l_LR = LR_array.shape[0]
@@ -193,6 +104,38 @@ def make_PV_from_LR(LR_array: cnp.ndarray,
     return PV
 
 
+@cython.cfunc
+def make_PV_from_LRC(LRC_array: cnp.ndarray,
+                     mapping_func=mapping_function_always_1) -> cnp.ndarray:
+    """Make sorted PV array from an LRC array for a certain chromosome
+    in a PETrackII object. The V/weight will be assigned as
+    `C * mapping_func( L, R )`, or simply the count C if mapping_func
+    is the default.
+
+    LRC array is an np.ndarray with dtype
+    [('l','i4'),('r','i4'),('c','u1')] with length of N
+
+    PV array is an np.ndarray with
+    dtype=[('p','u4'),('v','f4')] with length of 2N
+    """
+    l_LRC: cython.ulong
+    l_PV: cython.ulong
+    i: cython.ulong
+    L: cython.int
+    R: cython.int
+    C: cython.uchar
+    PV: cnp.ndarray
+
+    l_LRC = LRC_array.shape[0]
+    l_PV = 2 * l_LRC
+    PV = np.zeros(shape=l_PV, dtype=[('p', 'u4'), ('v', 'f4')])
+    for i in range(l_LRC):
+        (L, R, C) = LRC_array[i]
+        PV[i*2] = (L, C*mapping_func(L, R))
+        PV[i*2 + 1] = (R, -1.0 * C * mapping_func(L, R))
+    PV.sort(order='p')
+    return PV
+
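
To make the expansion concrete, here is a small numpy sketch of what make_PV_from_LRC computes with the default mapping function; the fragment coordinates are made up:

    import numpy as np

    # one fragment [10, 20) observed 3 times (c = 3), as stored by PETrackII
    lrc = np.array([(10, 20, 3)], dtype=[('l', 'i4'), ('r', 'i4'), ('c', 'u1')])

    # each fragment contributes +C at its left end and -C at its right end,
    # i.e. C * mapping_func(L, R), where the default mapping_func returns 1.0
    pv = np.zeros(2 * len(lrc), dtype=[('p', 'u4'), ('v', 'f4')])
    for i in range(len(lrc)):
        (L, R, C) = lrc[i]
        pv[2*i] = (L, C)
        pv[2*i + 1] = (R, -1.0 * C)
    pv.sort(order='p')
    print(pv)           # [(10, 3.) (20, -3.)]
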
 @cython.cfunc
 def make_PV_from_PN(P_array: cnp.ndarray, N_array: cnp.ndarray,
                     extsize: cython.int) -> cnp.ndarray:
@@ -202,22 +145,22 @@ def make_PV_from_PN(P_array: cnp.ndarray, N_array: cnp.ndarray,
     in this case since all positions should be extended with a fixed
     'extsize'.
 
-    P_array or N_array is an np.ndarray with dtype='int32'
+    P_array or N_array is an np.ndarray with dtype='i4'
 
     PV array is an np.ndarray with
-    dtype=[('p','uint32'),('v','float32')] with length of 2N
+    dtype=[('p','u4'),('v','f4')] with length of 2N
     """
-    l_PN: uint64_t
-    l_PV: uint64_t
-    i: uint64_t
-    L: int32_t
-    R: int32_t
+    l_PN: cython.ulong
+    l_PV: cython.ulong
+    i: cython.ulong
+    L: cython.int
+    R: cython.int
     PV: cnp.ndarray
 
     l_PN = P_array.shape[0]
     assert l_PN == N_array.shape[0]
     l_PV = 4 * l_PN
-    PV = np.zeros(shape=l_PV, dtype=[('p', 'uint32'), ('v', 'float32')])
+    PV = np.zeros(shape=l_PV, dtype=[('p', 'u4'), ('v', 'f4')])
     for i in range(l_PN):
         L = P_array[i]
         R = L + extsize
@@ -246,24 +189,29 @@ def pileup_PV(PV_array: cnp.ndarray) -> cnp.ndarray:
        save the pileup from position s to e is z -- in bedGraph style is to only save (e, z)
        s = e
     """
-    z: float32_t
-    v: float32_t
-    pre_z: float32_t
-    s: uint64_t
-    e: uint64_t
-    i: uint64_t
-    c: uint64_t
-    pileup_PV: cnp.ndarray  # this is in bedGraph style as in Pileup.pyx, p is the end of a region, and v is the pileup value
+    z: cython.float
+    v: cython.float
+    pre_z: cython.float
+    s: cython.ulong
+    e: cython.ulong
+    i: cython.ulong
+    c: cython.ulong
+    # this is in bedGraph style as in Pileup.pyx: p is the end of a
+    # region, and v is the pileup value
+    pileup_PV: cnp.ndarray
 
     z = 0
     pre_z = -10000
     s = 0
-    pileup_PV = np.zeros(shape=PV_array.shape[0], dtype=[('p', 'uint32'), ('v', 'float32')])
+    pileup_PV = np.zeros(shape=PV_array.shape[0], dtype=[('p', 'u4'),
+                                                         ('v', 'f4')])
     c = 0
     for i in range(PV_array.shape[0]):
         e = PV_array[i]['p']
         v = PV_array[i]['v']
-        if e != s:  # make sure only to record the final value for the same position
-            if z == pre_z:  # merge the p-v pair with the previous pair if the same v is found
+        # make sure only to record the final value for the same position
+        if e != s:
+            # merge the p-v pair with the previous pair if the same v is found
+            if z == pre_z:
                 pileup_PV[c-1]['p'] = e
             else:
                 pileup_PV[c] = (e, z)
@@ -274,3 +222,102 @@ def pileup_PV(PV_array: cnp.ndarray) -> cnp.ndarray:
     pileup_PV.resize(c, refcheck=False)
     # assert z == 0
     return pileup_PV
+
+# ------------------------------------
+# public python functions
+# ------------------------------------
+
+
+@cython.ccall
+def pileup_from_LR_hmmratac(LR_array: cnp.ndarray,
+                            mapping_dict: dict) -> cnp.ndarray:
+    # this function is specifically designed for piling up fragments
+    # for `hmmratac`.
+    #
+    # As for `hmmratac`, the weight depends on the length of the
+    # fragment, aka, value of R-L. Therefore, we need a mapping_dict
+    # for mapping length to weight.
+    l_LR: cython.ulong
+    l_PV: cython.ulong
+    i: cython.ulong
+    L: cython.int
+    R: cython.int
+    PV: cnp.ndarray
+    pileup: cnp.ndarray
+
+    l_LR = LR_array.shape[0]
+    l_PV = 2 * l_LR
+    PV = np.zeros(shape=l_PV, dtype=[('p', 'u4'), ('v', 'f4')])
+    for i in range(l_LR):
+        (L, R) = LR_array[i]
+        PV[i*2] = (L, mapping_dict[R - L])
+        PV[i*2 + 1] = (R, -1 * mapping_dict[R - L])
+    PV.sort(order='p')
+    pileup = pileup_PV(PV)
+    clean_up_ndarray(PV)
+    return pileup
+
+
+@cython.ccall
+def pileup_from_LR(LR_array: cnp.ndarray,
+                   mapping_func=mapping_function_always_1) -> cnp.ndarray:
+    """This function will pile up the ndarray containing left and
+    right positions, which is typically from PETrackI object. It's
+    useful when the pileup of a single chromosome is needed.
+
+    User needs to provide a numpy array of left and right positions,
+    with dtype=[('l','i4'),('r','i4')]. User also needs to
+    provide a mapping function to map the left and right position to
+    certain weight.
+
+    """
+    PV_array: cnp.ndarray
+    pileup: cnp.ndarray
+
+    PV_array = make_PV_from_LR(LR_array, mapping_func=mapping_func)
+    pileup = pileup_PV(PV_array)
+    clean_up_ndarray(PV_array)
+    return pileup
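
A minimal usage sketch of pileup_from_LR as defined above; the fragment coordinates are made up:

    import numpy as np
    from MACS3.Signal.PileupV2 import pileup_from_LR

    # two overlapping fragments, [0, 70) and [50, 100)
    lr = np.array([(0, 70), (50, 100)], dtype=[('l', 'i4'), ('r', 'i4')])
    print(pileup_from_LR(lr))
    # bedGraph style, as end/value pairs: depth 1 on [0, 50), 2 on
    # [50, 70) and 1 on [70, 100) -> [(50, 1.) (70, 2.) (100, 1.)]
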
return hmm_model -def hmm_predict( signals, lens, hmm_model ): - predictions = hmm_model.predict_proba( signals, lens ) + +def hmm_predict(signals, lens, hmm_model): + predictions = hmm_model.predict_proba(signals, lens) return predictions + class Test_HMM_train_poisson(unittest.TestCase): - def setUp( self ): - self.training_data = np.loadtxt("test/large_training_data.txt", delimiter="\t", dtype="float", usecols=(2,3,4,5)).astype(int).tolist() - self.training_data_lengths = np.loadtxt('test/large_training_lengths.txt', dtype="int").tolist() + def setUp(self): + self.training_data = np.loadtxt("test/large_training_data.txt", + delimiter="\t", + dtype="float", + usecols=(2, 3, 4, 5)).astype(int).tolist() + self.training_data_lengths = np.loadtxt('test/large_training_lengths.txt', + dtype="int").tolist() self.expected_converged = True self.not_expected_transmat = None self.n_features = 4 @@ -38,40 +41,49 @@ def setUp( self ): self.transmat = [[9.87606722e-01, 1.23932782e-02, 1.75299652e-11], [1.76603580e-02, 9.64232293e-01, 1.81073490e-02], [4.87992301e-14, 2.70319349e-02, 9.72968065e-01]] - self.lambdas = [[ 0.03809295, 0.62378578, 0.68739807, 0. ], - [ 0.23243362, 3.4420467, 4.256037, 0. ], - [ 2.58132377, 11.45924282, 8.13706237, 0. ]] + self.lambdas = [[0.03809295, 0.62378578, 0.68739807, 0.], + [0.23243362, 3.4420467, 4.256037, 0.], + [2.58132377, 11.45924282, 8.13706237, 0.]] # for prediction - self.prediction_data = np.loadtxt("test/small_prediction_data.txt", delimiter="\t", dtype="float", usecols=(2,3,4,5)).astype(int).tolist() - self.prediction_data_lengths = np.loadtxt('test/small_prediction_lengths.txt', dtype="int").tolist() - self.predictions = np.loadtxt('test/small_prediction_results_poisson.txt', delimiter="\t", dtype="float").tolist() + self.prediction_data = np.loadtxt("test/small_prediction_data.txt", + delimiter="\t", + dtype="float", + usecols=(2, 3, 4, 5)).astype(int).tolist() + self.prediction_data_lengths = np.loadtxt('test/small_prediction_lengths.txt', + dtype="int").tolist() + self.predictions = np.loadtxt('test/small_prediction_results_poisson.txt', + delimiter="\t", dtype="float").tolist() - def test_training( self ): + def test_training(self): # test hmm_training: - model = hmm_training(training_data = self.training_data, training_data_lengths = self.training_data_lengths, n_states = 3, random_seed = 12345) - print(model.startprob_) - print(model.transmat_) - print(model.lambdas_) - print(model.n_features) - self.assertEqual( model.monitor_.converged, self.expected_converged ) - self.assertNotEqual( model.transmat_.tolist(), self.not_expected_transmat ) - npt.assert_allclose( model.startprob_.tolist(), self.startprob ) + model = hmm_training(training_data=self.training_data, + training_data_lengths=self.training_data_lengths, + n_states=3, + random_seed=12345) + # print(model.startprob_) + # print(model.transmat_) + # print(model.lambdas_) + # print(model.n_features) + self.assertEqual(model.monitor_.converged, self.expected_converged) + self.assertNotEqual(model.transmat_.tolist(), self.not_expected_transmat) + npt.assert_allclose(model.startprob_.tolist(), self.startprob) npt.assert_allclose(model.transmat_, self.transmat) npt.assert_allclose(model.lambdas_, self.lambdas) npt.assert_allclose(model.n_features, self.n_features) - def test_predict( self ): + def test_predict(self): # test hmm_predict - hmm_model = PoissonHMM( n_components=3 ) + hmm_model = PoissonHMM(n_components=3) hmm_model.startprob_ = np.array(self.startprob) hmm_model.transmat_ = 
np.array(self.transmat) hmm_model.lambdas_ = np.array(self.lambdas) hmm_model.n_features = self.n_features - predictions = hmm_predict( self.prediction_data, self.prediction_data_lengths, hmm_model ) + predictions = hmm_predict(self.prediction_data, + self.prediction_data_lengths, + hmm_model) - # This is to write the prediction results into a file for 'correct' answer + # This is to write the prediction results into a file for 'correct' answer # with open("test/small_prediction_results_poisson.txt","w") as f: # for x,y,z in predictions: - # f.write( str(x)+"\t"+str(y)+"\t"+str(z)+"\n") - - npt.assert_allclose( predictions, self.predictions ) + # f.write(str(x)+"\t"+str(y)+"\t"+str(z)+"\n") + npt.assert_allclose(predictions, self.predictions) diff --git a/test/test_PairedEndTrack.py b/test/test_PairedEndTrack.py index c120f7c9..867623a0 100644 --- a/test/test_PairedEndTrack.py +++ b/test/test_PairedEndTrack.py @@ -1,14 +1,13 @@ #!/usr/bin/env python -# Time-stamp: <2024-10-11 16:20:17 Tao Liu> +# Time-stamp: <2024-10-14 21:55:05 Tao Liu> import unittest - from MACS3.Signal.PairedEndTrack import PETrackI, PETrackII +import numpy as np class Test_PETrackI(unittest.TestCase): def setUp(self): - self.input_regions = [(b"chrY", 0, 100), (b"chrY", 70, 270), (b"chrY", 70, 100), @@ -92,23 +91,39 @@ def setUp(self): (b"chrY", 50, 160, b"0w#AAACGAACAAGTAAGA", 2), (b"chrY", 100, 170, b"0w#AAACGAACAAGTAAGA", 3) ] + self.pileup_p = np.array([10, 50, 70, 80, 85, 100, 110, 160, 170, 180, 190], dtype="i4") + self.pileup_v = np.array([3.0, 4.0, 6.0, 9.0, 11.0, 15.0, 19.0, 18.0, 16.0, 10.0, 6.0], dtype="f4") + self.subset_pileup_p = np.array([10, 50, 70, 80, 85, 100, 110, 160, 170, 180, 190], dtype="i4") + self.subset_pileup_v = np.array([1.0, 2.0, 4.0, 6.0, 7.0, 8.0, 13.0, 12.0, 10.0, 5.0, 4.0], dtype="f4") self.t = sum([(x[2]-x[1]) * x[4] for x in self.input_regions]) def test_add_frag(self): pe = PETrackII() for (c, l, r, b, C) in self.input_regions: - pe.add_frag(c, l, r, b, C) + pe.add_loc(c, l, r, b, C) pe.finalize() # roughly check the numbers... self.assertEqual(pe.total, 22) self.assertEqual(pe.length, self.t) - def test_subset(self): - pe = PETrackII() - for (c, l, r, b, C) in self.input_regions: - pe.add_frag(c, l, r, b, C) - pe.finalize() - pe_subset = pe.subset(set([b'0w#AAACGAACAAGTAACA', b"0w#AAACGAACAAGTAAGA"])) + # subset + pe_subset = pe.subset({b'0w#AAACGAACAAGTAACA', b"0w#AAACGAACAAGTAAGA"}) # roughly check the numbers... 
self.assertEqual(pe_subset.total, 14) self.assertEqual(pe_subset.length, 1305) + + def test_pileup(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + bdg = pe.pileup_bdg() + d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + np.testing.assert_array_equal(d[0], self.pileup_p) + np.testing.assert_array_equal(d[1], self.pileup_v) + + pe_subset = pe.subset({b'0w#AAACGAACAAGTAACA', b"0w#AAACGAACAAGTAAGA"}) + bdg = pe_subset.pileup_bdg() + d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + np.testing.assert_array_equal(d[0], self.subset_pileup_p) + np.testing.assert_array_equal(d[1], self.subset_pileup_v) diff --git a/test/test_PeakIO.py b/test/test_PeakIO.py index a7af5a52..0a543f8a 100644 --- a/test/test_PeakIO.py +++ b/test/test_PeakIO.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Time-stamp: <2022-09-14 13:33:37 Tao Liu> +# Time-stamp: <2024-10-14 21:32:21 Tao Liu> import unittest import sys @@ -49,7 +49,7 @@ def test_exclude(self): r1.exclude(r2) result = str(r1) expected = str(self.exclude2from1) - print( "result:\n",result ) - print( "expected:\n", expected ) + # print( "result:\n",result ) + # print( "expected:\n", expected ) self.assertEqual( result, expected ) diff --git a/test/test_Pileup.py b/test/test_Pileup.py index 6abefa48..02b9b198 100644 --- a/test/test_Pileup.py +++ b/test/test_Pileup.py @@ -155,7 +155,7 @@ def test_pileup_1(self): self.param_1["extension"] ) result = [] (p,v) = pileup - print(p, v) + # print(p, v) pnext = iter(p).__next__ vnext = iter(v).__next__ pre = 0 @@ -217,7 +217,7 @@ def test_max(self): pileup = over_two_pv_array ( self.pv1, self.pv2, func="max" ) result = [] (p,v) = pileup - print(p, v) + # print(p, v) pnext = iter(p).__next__ vnext = iter(v).__next__ pre = 0 @@ -233,7 +233,7 @@ def test_min(self): pileup = over_two_pv_array ( self.pv1, self.pv2, func="min" ) result = [] (p,v) = pileup - print(p, v) + # print(p, v) pnext = iter(p).__next__ vnext = iter(v).__next__ pre = 0 @@ -249,7 +249,7 @@ def test_mean(self): pileup = over_two_pv_array ( self.pv1, self.pv2, func="mean" ) result = [] (p,v) = pileup - print(p, v) + # print(p, v) pnext = iter(p).__next__ vnext = iter(v).__next__ pre = 0
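
Tying the PairedEndTrack and PileupV2 changes together, a minimal end-to-end sketch in the spirit of the tests above; the fragments are made up and the barcodes are borrowed from the fixtures:

    from MACS3.Signal.PairedEndTrack import PETrackII

    pe = PETrackII()
    # (chromosome, left, right, barcode, count), as in the fixtures above
    pe.add_loc(b"chrY", 0, 100, b"0w#AAACGAACAAGTAAGA", 2)
    pe.add_loc(b"chrY", 50, 160, b"0w#AAACGAACAAGTAACA", 1)
    pe.finalize()

    bdg = pe.pileup_bdg()                    # bedGraphTrackI over all chromosomes
    (p, v) = bdg.get_data_by_chr(b"chrY")    # end positions and pileup values
    # depth is 2 on [0, 50), 3 on [50, 100) and 1 on [100, 160), so
    # p -> [50, 100, 160] and v -> [2., 3., 1.]

    # keep only the fragments carrying selected cell barcodes
    pe_subset = pe.subset({b"0w#AAACGAACAAGTAACA"})
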