# Created on Sat Sep 27 16:18:27 2014
# Author: XiaoTao Wang
# Organization: HuaZhong Agricultural University
from __future__ import division
import glob, re, os, sys
import logging
from tadlib.calfea import polygon
import numpy as np
from scipy.stats import poisson
from scipy.spatial import distance
from scipy.interpolate import interp1d, splrep, splev
import scipy.special as special
from sklearn import cluster
## Customize the logger
log = logging.getLogger(__name__)
def load_TAD(source_fil, chromname=None, cols=[0, 1, 2]):
"""
Load TAD data from a TXT file.
Parameters
----------
source_fil : str
Path to the TAD file.
chromname : None or str
Prefix of chromosome labels; it is stripped from each record during parsing.
Suppose ``chromname = 'chr'``, then the source data should be as
follows::
chr1 0 350000
chr1 350000 800000
chr1 800000 1450000
Default: None
cols : list of int (length: 3)
Which columns to read, with 0 being the first. For example,
``cols = [0, 1, 2]`` will extract the 1st, 2nd and 3rd columns.
(Default: [0, 1, 2])
Returns
-------
data : Numpy Structured Array
The parsed TAD intervals are contained in a numpy structured array
containing 3 fields: "chr", "start" and "end".
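Examples
--------
A minimal usage sketch; ``domains.txt`` is a hypothetical
whitespace-delimited file with rows like ``chr1  0  350000``:
>>> TADs = load_TAD('domains.txt', chromname='chr') # doctest: +SKIP
>>> TADs[TADs['chr'] == '1'] # doctest: +SKIP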
"""
source = os.path.abspath(os.path.expanduser(source_fil))
# Read TAD data from source
# Skip any comment lines
with open(source, 'r') as F:
    pool = [line.strip().split() for line in F
            if not line.startswith('#')]
# Skip header lines
pool = [line for line in pool if isnumber(line[cols[1]]) and isnumber(line[cols[2]])]
# Manipulate chromosome labels
maxL = 0 # Maximum label length
if not chromname:
chromname = ''
for i in range(len(pool)):
pool[i][cols[0]] = pool[i][cols[0]][len(chromname):]
if len(pool[i][cols[0]]) > maxL:
maxL = len(pool[i][cols[0]])
# Standardize API
pool[i] = (pool[i][cols[0]], float(pool[i][cols[1]]),
float(pool[i][cols[2]]))
# Create a structured array
dtype = np.dtype({'names':['chr', 'start', 'end'],
'formats':['U'+str(maxL), np.int64, np.int64]})
data = np.array(pool, dtype = dtype)
return data
class Core(object):
"""
Interaction analysis at TAD level.
High interaction frequencies (IFs) off the diagonal can be identified using
:py:meth:`tadlib.calfea.analyze.Core.longrange`. :py:meth:`tadlib.calfea.analyze.Core.DBSCAN`
performs a density-based clustering algorithm to detect aggregation patterns
in those IFs. Furthermore, two structural features, called AP
(Aggregation Preference) and Coverage in our original research, can be
calculated by :py:meth:`tadlib.calfea.analyze.Core.gdensity` and
:py:meth:`tadlib.calfea.analyze.Core.totalCover` respectively.
Parameters
----------
matrix : numpy.ndarray, (ndim = 2)
Interaction matrix of a TAD.
left : int
Starting point of TAD. For example, if the bin size is 10 kb,
``left = 50`` means position 500000 (bp) on the genome.
Attributes
----------
newM : numpy.ndarray, (ndim = 2)
Gap-free interaction matrix.
convert : list
Information required for converting *newM* to *matrix*.
cEM : numpy.ndarray, (ndim = 2)
Expected interaction matrix. An upper triangular matrix. Value in each
entry will be used to construct a Poisson Model for statistical
significance calculation.
fE : numpy.ndarray, (ndim = 2)
An upper triangular matrix. Each entry represents the fold enrichment
of corresponding observed interaction frequency.
Ps : numpy.ndarray, (ndim = 2)
An upper triangular matrix. Value in each entry indicates the p-value
under corresponding Poisson Model.
pos : numpy.ndarray, (shape = (N, 2))
Coordinates of the selected IFs in *newM*.
Np : int
Number of the selected IFs.
cluster_id : numpy.ndarray, (shape = (N,))
Cluster labels for each point in *pos*. -1 indicates noisy points.
Nc : int
Cluster number.
clusters : dict
Details of each cluster. "Average density", "radius",
"area (polygon)", "point coordinates", and "object number" are all
recorded.
Hulls : dict
Details of each convex hull (clusters which can be enclosed by a
convex polygon).
ptrace : list of int
Labels of clusters in which all points are collinear. These
clusters cannot be enclosed by a convex polygon.
gden : float, [0, 1]
Weighted average density.
coverage : float, [0, 1]
Total coverage of clusters.
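Examples
--------
A minimal sketch of the intended workflow; the matrix below is
synthetic, so the results are illustrative only:
>>> import numpy as np
>>> from tadlib.calfea.analyze import Core
>>> matrix = np.random.rand(50, 50) # doctest: +SKIP
>>> core = Core(matrix) # doctest: +SKIP
>>> core.longrange() # select significant IFs # doctest: +SKIP
>>> core.DBSCAN() # cluster the selected IFs # doctest: +SKIP
>>> core.gdensity() # sets core.gden (AP) # doctest: +SKIP
>>> core.totalCover() # sets core.coverage # doctest: +SKIP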
"""
def __init__(self, matrix, left = 0):
# rescale matrix
nonzero = matrix[matrix.nonzero()]
if np.median(nonzero) < 1:
min_nonzero = nonzero.min()
scale = 1 / min_nonzero
matrix = matrix * scale
# Manipulation, remove vacant rows and columns
self.newM, self.convert = manipulation(matrix, left)
## Determine proper off-diagonal level
Len = self.newM.shape[0]
idiag = np.arange(0, Len)
iIFs = []
for i in idiag:
temp = np.diagonal(self.newM, offset = i)
iIFs.append(temp.mean())
iIFs = np.array(iIFs)
idx = np.where(iIFs > 0)[0][0]
self._start = idx
IFs = iIFs[idx:]
diag = idiag[idx:]
self._Ed = _fitting(diag, IFs)
def longrange(self, pw = 2, ww = 5, top = 0.7, ratio = 0.05):
"""
Select statistically significant interactions of the TAD. Both
genomic distance and local interaction background are taken into
account.
Parameters
----------
pw : int
Width of the peak region. Default: 2
ww : int
Width of the donut. Default: 5
top : float, [0.5, 1]
Parameter for noisy interaction filtering. Default: 0.7
ratio : float, [0.01, 0.1]
Specifies the sample size of significant interactions.
Default: 0.05
Notes
-----
*pw* and *ww* are sensitive to data resolution. Setting *pw* to 4 and
*ww* to 7 works well for 5 kb data, and (2, 5) works well at 10 kb. [1]_
References
----------
.. [1] Rao, S.S., Huntley, M.H., Durand, N.C. et al. A 3D map of the
human genome at kilobase resolution reveals principles of chromatin
looping. Cell, 2014, 159: 1665-1680.
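Examples
--------
Resolution-dependent parameter choices (assuming *core* is a
:py:class:`tadlib.calfea.analyze.Core` instance):
>>> core.longrange(pw=4, ww=7) # 5 kb resolution # doctest: +SKIP
>>> core.longrange(pw=2, ww=5) # 10 kb resolution # doctest: +SKIP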
"""
dim = self.newM.shape[0]
ps = 2 * pw + 1 # Peak Size
ws = 2 * ww + 1 # Initial window size
bs = 2 * pw + 1 # Blurry (local smoothing) window size
start = ww if (ww > self._start) else self._start
# Upper triangular matrix
upper = np.triu(self.newM, k = start)
bUpper = np.triu(self.newM, k = 0)
# Expanded Matrix
expM = np.zeros((dim + ww*2, dim + ww*2))
expBM = np.zeros((dim + ww*2, dim + ww*2))
expM[ww:-ww, ww:-ww] = upper
expBM[ww:-ww, ww:-ww] = bUpper
tm = np.all((expBM == 0), axis = 0)
Mask = np.zeros((dim + ww*2, dim + ww*2), dtype = bool)
Mask[:,tm] = True
Mask[tm,:] = True
expCM = np.ones_like(expM, dtype = int)
expCM[Mask] = 0
## Expected matrix
EM_idx = np.triu_indices(dim, k = start)
EM_value = self._Ed[EM_idx[1] - EM_idx[0] - self._start]
EM = np.zeros((dim, dim))
EM[EM_idx] = EM_value
## Expanded Expected Matrix
expEM = np.zeros((dim + ww*2, dim + ww*2))
expEM[ww:-ww, ww:-ww] = EM
## Construct pool of matrices for speed
# Window
OPool_w = {}
EPool_w = {}
ss = range(ws)
for i in ss:
for j in ss:
OPool_w[(i,j)] = expM[i:(dim+i), j:(dim+j)]
EPool_w[(i,j)] = expEM[i:(dim+i), j:(dim+j)]
# Peak
OPool_p = {}
EPool_p = {}
ss = range(ww-pw, ps+ww-pw)
for i in ss:
for j in ss:
OPool_p[(i,j)] = expM[i:(dim+i), j:(dim+j)]
EPool_p[(i,j)] = expEM[i:(dim+i), j:(dim+j)]
# For Blurry Matrix
OPool_b = {}
OPool_bc = {}
ss = range(ww-pw, bs+ww-pw)
for i in ss:
for j in ss:
OPool_b[(i,j)] = expBM[i:(dim+i), j:(dim+j)]
OPool_bc[(i,j)] = expCM[i:(dim+i), j:(dim+j)]
## Background Strength --> Background Ratio
bS = np.zeros((dim, dim))
bE = np.zeros((dim, dim))
for w in OPool_w:
if (w[0] != ww) and (w[1] != ww):
bS += OPool_w[w]
bE += EPool_w[w]
for p in OPool_p:
if (p[0] != ww) and (p[1] != ww):
bS -= OPool_p[p]
bE -= EPool_p[p]
bE[bE==0] = 1
bR = bS / bE
## Corrected Expected Matrix
cEM = EM * bR
self.cEM = cEM
## Construct the Blurry Matrix
BM = np.zeros((dim, dim))
CM = np.zeros((dim, dim), dtype = int)
for b in OPool_b:
BM += OPool_b[b]
CM += OPool_bc[b]
mBM = np.zeros_like(BM)
Mask = CM != 0
mBM[Mask] = BM[Mask] / CM[Mask]
## Fold Enrichments
self.fE = np.zeros_like(self.cEM)
mask = self.cEM != 0
self.fE[mask] = upper[mask] / self.cEM[mask]
## Poisson Models
Poisses = poisson(cEM)
Ps = Poisses.sf(upper) # survival function, i.e., 1 - CDF
self.Ps = Ps
rmBM = mBM[EM_idx] # Raveled array
# Only consider the top x%
top_idx = np.argsort(rmBM)[int(np.floor((1-top)*rmBM.size)):]
# The most significant ones
rPs = Ps[EM_idx][top_idx]
Rnan = np.logical_not(np.isnan(rPs)) # Remove any invalid entry
RrPs = rPs[Rnan]
sig_idx = np.argsort(RrPs)[:int(np.ceil(ratio/top*RrPs.size))]
if sig_idx.size > 0:
self.pos = np.r_['1,2,0', EM_idx[0][top_idx][Rnan][sig_idx], EM_idx[1][top_idx][Rnan][sig_idx]]
else:
self.pos = np.array([])
self.Np = len(self.pos)
self._area = EM_idx[0].size
def DBSCAN(self):
"""Detect natural patterns in selected interactions using DBSCAN.
DBSCAN is a density-based clustering algorithm. [1]_ The two input
parameters *eps* and *MinPts* are estimated analytically. [2]_ Before
further analysis, some basic features are calculated for each
cluster, including "density", "radius" and "area", where "area"
refers to the area of the corresponding convex polygon.
See Also
--------
sklearn.cluster.DBSCAN : an implementation of DBSCAN
tadlib.calfea.polygon.Polygon : calculations based on polygon.
Notes
-----
Both "radius" and "density" are defined based on core objects of a
cluster. "radius" is the average distance from the core object to its
MinPts-nearest neighbors while "density" is the inverse of it.
References
----------
.. [1] Ester M, Kriegel H, Sander J, Xu X. A density-based
algorithm for discovering clusters in large spatial databases
with noise. Proc. 2nd Int. Conf. on Knowledge Discovery and Data
Mining, Portland, OR, AAAI, Press, 1996, pp. 226-231
.. [2] Daszykowski M, Walczak B, Massart DL. Looking for Natural
Patterns in Data, Part 1: Density Based Approach. Chemom. Intell.
Lab. Syst., 2001, 56: 83-92.
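Examples
--------
A minimal sketch; :py:meth:`tadlib.calfea.analyze.Core.longrange` must
be called first so that *pos* and *Np* are available:
>>> core.DBSCAN() # doctest: +SKIP
>>> core.Nc # number of detected clusters # doctest: +SKIP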
"""
self.Nc = 0
# Trace for scatter plot
self.ptrace = []
# Lower bound for input
if self.Np >= 5:
self._epsilon()
# Minimum epsilon
if self._eps < np.sqrt(2):
self._eps = np.sqrt(2)
# A simple but prerequisite condition
if self._eps > 0:
# Density-based cluster identification
db = cluster.DBSCAN(eps = self._eps,
min_samples = self._MinPts).fit(self.pos)
# Cluster Label, -1 means noise
self.cluster_id = db.labels_.astype(int)
# Frequency table of cluster labels
labels, counts = np.unique(self.cluster_id, return_counts=True)
table = np.column_stack((labels, counts))
mask = (table[:,0] != -1) & (table[:,1] == 1)
ridx = table[mask][:,0]
mask = np.ones(self.Np, dtype = bool)
if len(ridx) > 0:
ridx.sort()
for i in ridx[::-1]:
mask &= (self.cluster_id != i)
for i in ridx[::-1]:
self.cluster_id[self.cluster_id > i] -= 1
self.pos = self.pos[mask]
self.cluster_id = self.cluster_id[mask]
# Number of clusters
self.Nc = len(set(self.cluster_id)) - \
(1 if -1 in self.cluster_id else 0)
if self.Nc > 0:
## Cluster-based attributes
self.clusters = {}
self.clusters['density'] = np.zeros(self.Nc)
self.clusters['radius'] = np.zeros(self.Nc)
self.clusters['areas'] = np.zeros(self.Nc)
self.clusters['obj'] = []
self.clusters['Cn'] = np.zeros(self.Nc, dtype = int)
# Hull-based attributes
self.Hulls = {}
self.Hulls['polygons'] = []
self.Hulls['density'] = []
core_indices = db.core_sample_indices_
cores_mask = np.zeros(self.Np, dtype = bool)
cores_mask[core_indices] = True
cores_mask = cores_mask[mask]
# For each cluster
for i in range(self.Nc):
extract = (self.cluster_id == i)
t_C = self.pos[extract]
# Objects
self.clusters['obj'].append(t_C)
# Total object number
self.clusters['Cn'][i] = extract.sum()
## Average radius / density
# Core points of current cluster
choose = extract & cores_mask
cores = self.pos[choose]
# Distances from core points to any other points in
# current cluster
dists = distance.cdist(cores, t_C)
dists.sort()
# Average radius
self.clusters['radius'][i] = np.mean(dists[:, 1:self._MinPts].sum(axis = -1) / \
(self._MinPts - 1))
# Inverse of average radius, i.e., density
self.clusters['density'][i] = np.mean((self._MinPts - 1) / \
dists[:, 1:self._MinPts].sum(axis = -1))
# Collinear test
judge = polygon.collinear(t_C)
if not judge:
# Represented as a convex polygon
P = polygon.Polygon(t_C)
# Area of the polygon
P.calarea()
self.clusters['areas'][i] = P.area
self.Hulls['polygons'].append(P)
self.Hulls['density'].append(self.clusters['density'][i])
else:
self.ptrace.append(i)
self.clusters['areas'][i] = 0
# Change the number of points
self.Np = len(self.pos)
def gdensity(self):
"""Weighted density calculation.
:py:meth:`tadlib.calfea.analyze.Core.longrange` and
:py:meth:`tadlib.calfea.analyze.Core.DBSCAN` have to be called in advance.
Density of a TAD is the weighted average density of its clusters.
The weight of a cluster is the ratio of its object number to the
total number of clustered (non-noise) points.
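Notes
-----
A sketch of the calculation with two hypothetical clusters, one with
6 points and density 0.5, the other with 4 points and density 0.2::
gden = 0.5 * (6 / 10) + 0.2 * (4 / 10) = 0.38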
"""
if self.Nc > 0:
Num = len(self.pos[self.cluster_id != -1])
W = self.clusters['Cn'] / Num
self.gden = np.sum(self.clusters['density'] * W)
else:
self.gden = 0
def totalCover(self):
"""
Total coverage of clusters.
:py:meth:`tadlib.calfea.analyze.Core.longrange` and
:py:meth:`tadlib.calfea.analyze.Core.DBSCAN` have to be called in advance.
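Notes
-----
Coverage is the summed convex-hull area of all clusters divided by the
number of matrix entries in the analyzed upper-triangular region, so
it always lies within [0, 1].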
"""
if self.Nc > 0:
csum = self.clusters['areas'].sum()
self.coverage = csum / self._area
else:
self.coverage = 0
def convertMatrix(self, M):
"""
Convert an internal gap-free matrix (e.g., *newM*, *cEM*, *fE*, and
*Ps*) into a new matrix with the same shape as the original
interaction matrix, using the recorded index map (see the *convert*
attribute).
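Examples
--------
A minimal sketch (assuming *core* is an analyzed
:py:class:`tadlib.calfea.analyze.Core` instance):
>>> fullPs = core.convertMatrix(core.Ps) # doctest: +SKIP
>>> fullPs.shape == (core.convert[1], core.convert[1]) # doctest: +SKIP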
"""
idx = sorted(self.convert[0].values())
newM = np.zeros((self.convert[1], self.convert[1]), dtype=M.dtype)
y,x = np.meshgrid(idx, idx)
newM[x,y] = M
return newM
def _epsilon(self):
"""Analytical way of estimating input parameters for DBSCAN.
"""
## Neighborhood of a point
if len(self.pos.shape) > 1:
# m objects, n variables
m, n = self.pos.shape
else:
m = self.pos.shape[0]
n = 1
# Minimum number of points considered as a cluster
self._MinPts = int(np.ceil(m / 25)) + 1
# Area (volume) of the bounding box enclosing all points
prod = np.prod(self.pos.max(axis = 0) - self.pos.min(axis = 0))
# Analytical eps: radius of an n-dimensional hypersphere that would
# contain MinPts points on average if the m points were spread
# uniformly over the enclosing space
gamma = special.gamma(0.5 * n + 1)
denom = m * np.sqrt(np.pi ** n)
self._eps = ((prod * self._MinPts * gamma) / denom) ** (1.0 / n)
def _fitting(x, y):
## Linear Spline
ip = interp1d(x, y)
# Downsample the data evenly
times = np.arange(2, 4)
scheck = x.size / times
snum = int(scheck[scheck > 6][-1]) if (scheck > 6).sum() > 0 else x.size
xi = np.linspace(x.min(), x.max(), snum)
yi = ip(xi)
## B-spline
tcl = splrep(xi, yi)
ys = splev(x, tcl)
# Finite differences
dy1 = np.gradient(ys)
## Expected IFs should decay with distance; truncate the fitted curve
## at the first point where it starts to rise
m = (dy1[1:] >= 0) & (dy1[:-1] <= 0)
if len(np.where(m)[0]) != 0:
i = np.where(m)[0][0]
ys[x > x[i]] = ys[i]
return ys
def isnumber(value):
"""A numerical string or not.
A string is numerical if it can be converted using **float** function.
Parameters
----------
value : str
Returns
-------
bool
True if *value* is numerical, False otherwise.
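Examples
--------
Plain and scientific notation are both accepted:
>>> isnumber('3.5e-2')
True
>>> isnumber('chr1')
False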
"""
try:
float(value)
except (TypeError, ValueError):
return False
else:
return True
def getmatrix(inter, l_bin, r_bin):
"""Extract regional interaction data and place it into a matrix.
Parameters
----------
inter : numpy structured array
Three fields are required, "bin1", "bin2" and "IF", data types of
which are int, int and float, respectively.
l_bin : int
Left bin index of the region.
r_bin : int
Right bin index of the region.
Returns
-------
inter_matrix : numpy.ndarray
The value of each entry is the interaction frequency between
corresponding two bins.
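Examples
--------
A minimal sketch with a hand-built structured array (the values are
illustrative):
>>> import numpy as np
>>> from tadlib.calfea.analyze import getmatrix
>>> dt = np.dtype({'names':['bin1', 'bin2', 'IF'],
...                'formats':[int, int, float]})
>>> inter = np.array([(0, 1, 2.), (1, 1, 1.)], dtype=dt)
>>> M = getmatrix(inter, 0, 2)
>>> bool(M[0, 1] == M[1, 0] == 2.0) # off-diagonal IFs fill symmetrically
True
>>> float(M[1, 1])
1.0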
"""
# Construct a matrix
inter_matrix = np.zeros((r_bin - l_bin, r_bin - l_bin), dtype = float)
# Extract the regional data
mask = (inter['bin1'] >= l_bin) & (inter['bin1'] < r_bin) & \
(inter['bin2'] >= l_bin) & (inter['bin2'] < r_bin)
inter_extract = inter[mask]
# Fill the matrix
for i in inter_extract:
# Off-diagonal parts
if i['bin1'] != i['bin2']:
inter_matrix[i['bin1'] - l_bin][i['bin2'] - l_bin] += i['IF']
inter_matrix[i['bin2'] - l_bin][i['bin1'] - l_bin] += i['IF']
else:
# Diagonal part
inter_matrix[i['bin1'] - l_bin][i['bin2'] - l_bin] += i['IF']
return inter_matrix
def manipulation(matrix, start = 0):
"""Remove gaps of the original interaction matrix.
Parameters
----------
matrix : numpy.ndarray, (ndim = 2)
Interaction matrix.
start : int
The beginning of the region. (Default: 0)
Returns
-------
newM : numpy.ndarray, (ndim = 2)
The gap-removed matrix.
convert : list
The first element is the index map from *newM* to *matrix*, and
the second element records the length of *matrix*.
Examples
--------
>>> import numpy as np
>>> from tadlib.calfea.analyze import manipulation
>>> matrix = np.random.rand(4, 4)
>>> matrix[1,:] = 0; matrix[:,1] = 0
>>> print(matrix)
[[ 0.24822414 0. 0.07782508 0.01812965]
[ 0. 0. 0. 0. ]
[ 0.93870151 0. 0.21986474 0.20462965]
[ 0.13022712 0. 0.78674168 0.77068304]]
>>> newM, convert = manipulation(matrix)
>>> print(newM)
[[ 0.24822414 0.07782508 0.01812965]
[ 0.93870151 0.21986474 0.20462965]
[ 0.13022712 0.78674168 0.77068304]]
>>> print(convert)
[{0: 0, 1: 2, 2: 3}, 4]
"""
mask = matrix.sum(axis = 0) == 0
index = list(np.where(mask)[0])
# Remove vacant rows
temp = np.delete(matrix, index, 0)
# Vacant columns
newM = np.delete(temp, index, 1)
mapidx = dict(zip(np.arange(len(newM)),
np.where(np.logical_not(mask))[0] + start))
convert = [mapidx, matrix.shape[0]]
return newM, convert