"""
This module provides functions for using Orca models for various
types of the predictions. This is the main module that you need for
interacting with Orca models.
To use any of the prediction functions, `load_resources` has to be
called first to load the necessary resources.
The coordinates used in Orca are 0-based, inclusive for the start
coordinate and exclusive for the end coordinate, consistent with
python conventions.
"""
import os
import pathlib
import numpy as np
import torch
from scipy.stats import spearmanr
from selene_utils2 import MemmapGenome, Genomic2DFeatures
import selene_sdk
from selene_sdk.sequences import Genome
from orca_models import H1esc, Hff, H1esc_1M, Hff_1M, H1esc_256M, Hff_256M
from orca_utils import (
genomeplot,
genomeplot_256Mb,
StructuralChange2,
process_anno,
coord_round,
coord_clip,
)
ORCA_PATH = str(pathlib.Path(__file__).parent.absolute())
model_dict_global, target_dict_global = {}, {}
def load_resources(models=["32M"], use_cuda=True, use_memmapgenome=True):
    """
    Load resources for Orca predictions including the specified
    Orca models and hg38 reference genome. It also creates Genomic2DFeatures
    objects for experimental micro-C datasets (for comparison with prediction).
    Loaded resources are accessible as global variables.

    The list of global variables generated is here:

    Global Variables
    ----------------
    hg38 : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        If `use_memmapgenome==True` and the resource file for hg38 mmap exists,
        use MemmapGenome instead of Genome.
    h1esc : orca_models.H1esc
        1-32Mb Orca H1-ESC model
    hff : orca_models.Hff
        1-32Mb Orca HFF model
    h1esc_256m : orca_models.H1esc_256M
        32-256Mb Orca H1-ESC model
    hff_256m : orca_models.Hff_256M
        32-256Mb Orca HFF model
    h1esc_1m : orca_models.H1esc_1M
        1Mb Orca H1-ESC model
    hff_1m : orca_models.Hff_1M
        1Mb Orca HFF model
    target_h1esc : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load H1-ESC micro-C dataset 4DNFI9GMP2J8
        at 4kb resolution, used for comparison with 1-32Mb models.
    target_hff : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load HFF micro-C dataset 4DNFI643OYP9
        at 4kb resolution, used for comparison with 1-32Mb models.
    target_h1esc_256m : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load H1-ESC micro-C dataset 4DNFI9GMP2J8
        at 32kb resolution, used for comparison with 32-256Mb models.
    target_hff_256m : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load HFF micro-C dataset 4DNFI643OYP9
        at 32kb resolution, used for comparison with 32-256Mb models.
    target_h1esc_1m : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load H1-ESC micro-C dataset 4DNFI9GMP2J8
        at 1kb resolution, used for comparison with 1Mb models.
    target_hff_1m : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load HFF micro-C dataset 4DNFI643OYP9
        at 1kb resolution, used for comparison with 1Mb models.
    target_available : bool
        Indicate whether the micro-C dataset resource file is available.

    Parameters
    ----------
    models : list(str)
        List of model types to load, supported model types includes
        "32M", "256M", "1M", corresponding to 1-32Mb, 32-256Mb, and 1Mb
        models. Lower cases are also accepted.
    use_cuda : bool, optional
        Default is True. If true, loaded models are moved to GPU.
    use_memmapgenome : bool, optional
        Default is True. If True and the resource file for hg38 mmap exists,
        use MemmapGenome instead of Genome.
    """
    global hg38, target_hff, target_h1esc, target_hff_256m, target_h1esc_256m, target_hff_1m, target_h1esc_1m, target_available
    # Normalize requested model names once so "32M" and "32m" are equivalent.
    # Bug fix: the previous checks were of the form `if "32M" or "32m" in models:`
    # which is always true (truthy string literal), so every model family was
    # loaded regardless of what was requested.
    requested = {str(m).lower() for m in models}
    if "32m" in requested:
        global h1esc, hff
        h1esc = H1esc()
        h1esc.eval()
        hff = Hff()
        hff.eval()
        if use_cuda:
            h1esc.cuda()
            hff.cuda()
        else:
            h1esc.cpu()
            hff.cpu()
        model_dict_global["h1esc"] = h1esc
        model_dict_global["hff"] = hff
    if "1m" in requested:
        global h1esc_1m, hff_1m
        h1esc_1m = H1esc_1M()
        h1esc_1m.eval()
        hff_1m = Hff_1M()
        hff_1m.eval()
        if use_cuda:
            h1esc_1m.cuda()
            hff_1m.cuda()
        else:
            h1esc_1m.cpu()
            hff_1m.cpu()
        model_dict_global["h1esc_1m"] = h1esc_1m
        model_dict_global["hff_1m"] = hff_1m
    if "256m" in requested:
        global h1esc_256m, hff_256m
        h1esc_256m = H1esc_256M()
        h1esc_256m.eval()
        hff_256m = Hff_256M()
        hff_256m.eval()
        if use_cuda:
            h1esc_256m.cuda()
            hff_256m.cuda()
        else:
            h1esc_256m.cpu()
            hff_256m.cpu()
        model_dict_global["h1esc_256m"] = h1esc_256m
        model_dict_global["hff_256m"] = hff_256m

    fa_path = ORCA_PATH + "/resources/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
    mmap_path = ORCA_PATH + "/resources/Homo_sapiens.GRCh38.dna.primary_assembly.fa.mmap"
    # Bug fix: the existence check previously tested the absolute path
    # "/resources/...mmap" instead of the ORCA_PATH-relative file actually used.
    if use_memmapgenome and pathlib.Path(mmap_path).exists():
        hg38 = MemmapGenome(input_path=fa_path, memmapfile=mmap_path)
    else:
        hg38 = Genome(input_path=fa_path)

    # Experimental micro-C targets (used for plotting comparisons only);
    # target_available is set to False if either mcool resource is missing.
    target_available = True
    if os.path.exists(ORCA_PATH + "/resources/4DNFI643OYP9.rebinned.mcool"):
        target_hff = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI643OYP9.rebinned.mcool::/resolutions/4000"],
            ["r4000"],
            (8000, 8000),
            cg=True,
        )
        target_hff_256m = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI643OYP9.rebinned.mcool::/resolutions/32000"],
            ["r32000"],
            (8000, 8000),
            cg=True,
        )
        target_hff_1m = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI643OYP9.rebinned.mcool::/resolutions/1000"],
            ["r1000"],
            (8000, 8000),
            cg=True,
        )
        target_dict_global['hff'] = target_hff
        target_dict_global['hff_256m'] = target_hff_256m
        target_dict_global['hff_1m'] = target_hff_1m
    else:
        target_available = False
    if os.path.exists(ORCA_PATH + "/resources/4DNFI9GMP2J8.rebinned.mcool"):
        target_h1esc = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI9GMP2J8.rebinned.mcool::/resolutions/4000"],
            ["r4000"],
            (8000, 8000),
            cg=True,
        )
        target_h1esc_256m = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI9GMP2J8.rebinned.mcool::/resolutions/32000"],
            ["r32000"],
            (8000, 8000),
            cg=True,
        )
        target_h1esc_1m = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI9GMP2J8.rebinned.mcool::/resolutions/1000"],
            ["r1000"],
            (8000, 8000),
            cg=True,
        )
        target_dict_global['h1esc'] = target_h1esc
        target_dict_global['h1esc_256m'] = target_h1esc_256m
        target_dict_global['h1esc_1m'] = target_h1esc_1m
    else:
        target_available = False
def genomepredict(
    sequence, mchr, mpos=-1, wpos=-1, models=["h1esc", "hff"], targets=None, annotation=None, use_cuda=True, nan_thresh=1,
):
    """Multiscale prediction for a 32Mb sequence
    input, zooming into the position specified when generating a series
    of 32Mb, 16Mb, 8Mb, 4Mb, 2Mb and 1Mb predictions with increasing
    resolutions (up to 4kb). This function also processes
    information used only for plotting including targets and annotation.

    For larger sequence and interchromosomal predictions, you can use
    256Mb input with genomepredict_256Mb.

    Parameters
    ----------
    sequence : numpy.ndarray
        One-hot sequence encoding of shape 1 x 4 x 32000000.
        The encoding can be generated with `selene_sdk.Genome.sequence_to_encoding()`.
    mchr : str
        Chromosome name. This is used for annotation purpose only.
    mpos : int, optional
        The coordinate to zoom into for multiscale prediction.
    wpos : int, optional
        The coordinate of the center position of the sequence, which is
        start position + 16000000.
    models : list(torch.nn.Module or str), optional
        Models to use. Default is H1-ESC and HFF Orca models.
    targets : list(numpy.ndarray), optional
        The observed balanced contact matrices from the
        32Mb region. Used only for plotting when used with genomeplot. The length and
        order of the list of targets should match the models specified (default is
        H1-ESC and HFF Orca models).
        The dimensions of the arrays should be 8000 x 8000 (1kb resolution).
    annotation : str or None, optional
        List of annotations for plotting. The annotation can be generated with
        orca_utils.process_anno and see its documentation for more details.
    use_cuda : bool, optional
        Default is True. If False, use CPU.
    nan_thresh : int, optional
        Default is 1. Specify the threshold of the proportion of NaNs values
        allowed during downsampling for the observed matrices. Only relevant for plotting.
        The lower resolution observed matrix value are computed by averaging multiple
        bins into one. By default, we allow missing values and only average over the
        non-missing values, and the values with more than the specified proportion
        of missing values will be filled with NaN.

    Returns
    -------
    output : dict
        Result dictionary that can be used as input for genomeplot. The dictionary
        has the following keys:

        - predictions : list(list(numpy.ndarray), list(numpy.ndarray))
            Multi-level predictions for H1-ESC and HFF cell types.
        - experiments : list(list(numpy.ndarray), list(numpy.ndarray))
            Observations for H1-ESC and HFF cell types that matches the predictions.
            Exists if `targets` is specified.
        - normmats : list(list(numpy.ndarray), list(numpy.ndarray))
            Background distance-based expected balanced contact matrices for
            H1-ESC and HFF cell types that matches the predictions.
        - start_coords : list(int)
            Start coordinates for the prediction at each level.
        - end_coords : list(int)
            End coordinates for the prediction at each level.
        - chr : str
            The chromosome name.
        - annos : list(list(...))
            Annotation information. The format is as outputed by orca_utils.process_anno
            Exists if `annotation` is specified.
    """
    # Resolve model names to loaded model objects. Bug fix: the previous code
    # wrapped `if m in model_dict_global:` in try/except KeyError, but a
    # membership test never raises KeyError, so the load_resources fallback
    # was dead code and an unloaded model name was silently dropped.
    model_objs = []
    for m in models:
        if isinstance(m, torch.nn.Module):
            model_objs.append(m)
        else:
            if m not in model_dict_global:
                load_resources(models=["32M"], use_cuda=use_cuda)
            # Raises KeyError if the name is still unknown after loading.
            model_objs.append(model_dict_global[m])
    models = model_objs
    n_models = len(models)
    with torch.no_grad():
        allpreds = []
        allstarts = []
        if targets:
            alltargets = []
        if annotation is not None:
            allannos = []
        # iii == 0: forward strand; iii == 1: reverse-complement strand.
        # The two strand predictions are averaged when assembling the output.
        for iii, seq in enumerate(
            [
                torch.FloatTensor(sequence),
                torch.FloatTensor(np.ascontiguousarray(sequence[:, ::-1, ::-1])),
            ]
        ):
            for ii, model in enumerate(models):
                if targets and iii == 0:
                    target = targets[ii]
                # Encoders produce feature maps at 6 zoom levels (4kb to 128kb bins).
                (encoding1, encoding2, encoding4, encoding8, encoding16, encoding32,) = model.net(
                    model.net0(torch.Tensor(seq.float()).transpose(1, 2).cuda())
                    if use_cuda
                    else model.net0(torch.Tensor(seq.float()).transpose(1, 2))
                )
                encodings = {
                    1: encoding1,
                    2: encoding2,
                    4: encoding4,
                    8: encoding8,
                    16: encoding16,
                    32: encoding32,
                }

                def eval_step(level, start, coarse_pred=None):
                    # Predict a 250x250 matrix at `level`, optionally conditioned
                    # on the coarser-level prediction of the same window.
                    distenc = torch.log(
                        torch.FloatTensor(model.normmats[level][None, None, :, :]).cuda()
                        if use_cuda
                        else torch.FloatTensor(model.normmats[level][None, None, :, :])
                    ).expand(sequence.shape[0], 1, 250, 250)
                    if coarse_pred is not None:
                        if level == 1:
                            # The 1Mb level adds a dedicated refinement head.
                            pred = model.denets[level].forward(
                                encodings[level][
                                    :, :, int(start / level) : int(start / level) + 250
                                ],
                                distenc,
                                coarse_pred,
                            ) + model.denet_1_pt.forward(
                                encodings[level][
                                    :, :, int(start / level) : int(start / level) + 250
                                ]
                            )
                        else:
                            pred = model.denets[level].forward(
                                encodings[level][
                                    :, :, int(start / level) : int(start / level) + 250
                                ],
                                distenc,
                                coarse_pred,
                            )
                    else:
                        pred = model.denets[level].forward(
                            encodings[level][:, :, int(start / level) : int(start / level) + 250],
                            distenc,
                        )
                    return pred

                preds = []
                starts = [0]
                if targets and iii == 0:
                    ts = []
                if annotation is not None and iii == 0:
                    annos = []
                for j, level in enumerate([32, 16, 8, 4, 2, 1]):
                    if j == 0:
                        pred = eval_step(level, starts[j])
                    else:
                        # Condition on the quadrant of the previous level's
                        # prediction selected by start_index.
                        pred = eval_step(
                            level,
                            starts[j],
                            preds[j - 1][
                                :,
                                :,
                                start_index : start_index + 125,
                                start_index : start_index + 125,
                            ],
                        )
                    if targets and iii == 0:
                        # Downsample the observed matrix to 250x250 bins by
                        # nan-aware averaging over level x level patches.
                        target_r = np.nanmean(
                            np.nanmean(
                                np.reshape(
                                    target[
                                        :,
                                        starts[j] : starts[j] + 250 * level,
                                        starts[j] : starts[j] + 250 * level,
                                    ].numpy(),
                                    (target.shape[0], 250, level, 250, level),
                                ),
                                axis=4,
                            ),
                            axis=2,
                        )
                        target_nan = np.mean(
                            np.mean(
                                np.isnan(
                                    np.reshape(
                                        target[
                                            :,
                                            starts[j] : starts[j] + 250 * level,
                                            starts[j] : starts[j] + 250 * level,
                                        ].numpy(),
                                        (target.shape[0], 250, level, 250, level),
                                    )
                                ),
                                axis=4,
                            ),
                            axis=2,
                        )
                        # Mask bins dominated by missing values.
                        target_r[target_nan > nan_thresh] = np.nan
                        # Log fold-change over the distance-based expectation.
                        target_np = np.log(
                            (target_r + model.epss[level])
                            / (model.normmats[level] + model.epss[level])
                        )[0, 0:, 0:]
                        ts.append(target_np)
                    if annotation is not None and iii == 0:
                        # Rescale annotations into the [0, 1] span of this window.
                        newstart = starts[j] / 8000.0
                        newend = (starts[j] + 250 * level) / 8000.0
                        anno_r = []
                        for r in annotation:
                            if len(r) == 3:
                                if not (r[0] >= newend or r[1] <= newstart):
                                    anno_r.append(
                                        (
                                            np.fmax((r[0] - newstart) / (newend - newstart), 0,),
                                            np.fmin((r[1] - newstart) / (newend - newstart), 1,),
                                            r[2],
                                        )
                                    )
                            else:
                                if r[0] >= newstart and r[0] < newend:
                                    anno_r.append(((r[0] - newstart) / (newend - newstart), r[1]))
                        annos.append(anno_r)
                    # Pick the next zoom window so that mpos stays inside it;
                    # on the reverse strand the coordinate system is mirrored.
                    if iii == 0:
                        start_index = int(
                            np.clip(
                                np.floor(
                                    (
                                        (mpos - level * 1000000 / 4)
                                        - (wpos - 16000000 + starts[j] * 4000)
                                    )
                                    / (4000 * level)
                                ),
                                0,
                                125,
                            )
                        )
                    else:
                        start_index = int(
                            np.clip(
                                np.ceil(
                                    (
                                        (wpos + 16000000 - starts[j] * 4000)
                                        - (mpos + level * 1000000 / 4)
                                    )
                                    / (4000 * level)
                                ),
                                0,
                                125,
                            )
                        )
                    starts.append(starts[j] + start_index * level)
                    preds.append(pred)
                allpreds.append(preds)
                if iii == 0:
                    if targets:
                        alltargets.append(ts)
                    if annotation is not None:
                        allannos.append(annos)
                    allstarts.append(starts[:-1])
    output = {}
    # Average forward- and reverse-strand predictions (reverse is flipped back).
    output["predictions"] = [[] for _ in range(n_models)]
    for i in range(n_models):
        for j in range(len(allpreds[i])):
            if allpreds[i][j].shape[1] == 1:
                output["predictions"][i].append(
                    allpreds[i][j].cpu().detach().numpy()[0, 0, :, :] * 0.5
                    + allpreds[i + n_models][j].cpu().detach().numpy()[0, 0, ::-1, ::-1] * 0.5
                )
            else:
                output["predictions"][i].append(
                    allpreds[i][j].cpu().detach().numpy()[0, :, :, :] * 0.5
                    + allpreds[i + n_models][j].cpu().detach().numpy()[0, :, ::-1, ::-1] * 0.5
                )
    if targets:
        output["experiments"] = alltargets
    else:
        output["experiments"] = None
    output["start_coords"] = [wpos - 16000000 + s * 4000 for s in allstarts[0]]
    output["end_coords"] = [
        int(output["start_coords"][ii] + 32000000 / 2 ** (ii)) for ii in range(6)
    ]
    output["chr"] = mchr
    if annotation is not None:
        output["annos"] = allannos[0]
    else:
        output["annos"] = None
    output["normmats"] = [
        [model.normmats[ii] for ii in [32, 16, 8, 4, 2, 1]] for model in models
    ]
    return output
def genomepredict_256Mb(
    sequence,
    mchr,
    normmats,
    chrlen,
    mpos=-1,
    wpos=-1,
    models=["h1esc_256m", "hff_256m"],
    targets=None,
    annotation=None,
    padding_chr=None,
    use_cuda=True,
    nan_thresh=1,
):
    """Multiscale prediction for a 256Mb sequence
    input, zooming into the position specified when generating a series
    of 256Mb, 128Mb, 64Mb, and 32Mb predictions with increasing
    resolutions (up to 128kb). This function also processes
    information used only for plotting including targets and annotation.

    This function accepts multichromosomal input sequence. Thus it needs an
    extra input `normmats` to encode the chromosomal information. See documentation
    for normmats argument for details.

    Parameters
    ----------
    sequence : numpy.ndarray
        One-hot sequence encoding of shape 1 x 4 x 256000000.
        The encoding can be generated with `selene_sdk.Genome.sequence_to_encoding()`.
    mchr : str
        The chromosome name of the first chromosome included in the sequence.
        This is used for annotation purpose only.
    normmats : list(numpy.ndarray)
        A list of distance-based background matrices for H1-ESC and HFF. The
        normmats contains arrays with dimensions 8000 x 8000 (32kb resolution).
        Interchromosomal interactions are filled with the expected balanced contact
        score for interchromosomal interactions.
    chrlen : int
        The coordinate of the end of the first chromosome in the input, which is the
        chromosome that will be zoomed into.
    mpos : int, optional
        Default is -1. The coordinate to zoom into for multiscale prediction. If neither
        `mpos` nor `wpos` are specified, it zooms into the center of the input by default.
    wpos : int, optional
        Default is -1. The coordinate of the center position of the sequence, which is
        start position + 128000000. If neither `mpos` nor `wpos` are specified, it zooms
        into the center of the input by default.
    models : list(torch.nn.Module or str), optional
        Models to use. Default is H1-ESC(256Mb) and HFF(256Mb) Orca models.
    targets : list(numpy.ndarray), optional
        The observed balanced contact matrices from the 256Mb sequence.
        Used only for plotting when used with genomeplot. The length and
        order of the list of targets should match the models specified (default is
        H1-ESC and HFF Orca models). The dimensions of the arrays should be
        8000 x 8000 (32kb resolution).
    annotation : str or None, optional
        Default is None. List of annotations for plotting. The annotation can be generated with
        orca_utils.process_anno and see its documentation for more details.
    padding_chr : str, None, optional
        Default is None. Name of the padding chromosome after the first. Used for annotation
        only. TODO: be more flexible in the support for multiple chromosomes.
    use_cuda : bool, optional
        Default is True. If False, use CPU.
    nan_thresh : int, optional
        Default is 1. Specify the threshold of the proportion of NaNs values
        allowed during downsampling for the observed matrices. Only relevant for plotting.
        The lower resolution observed matrix value are computed by averaging multiple
        bins into one. By default, we allow missing values and only average over the
        non-missing values, and the values with more than the specified proportion
        of missing values will be filled with NaN.

    Returns
    -------
    output : dict
        Result dictionary that can be used as input for genomeplot_256Mb. The
        dictionary has the following keys:

        - predictions : list(list(numpy.ndarray), list(numpy.ndarray))
            Multi-level predictions for H1-ESC and HFF cell types.
        - experiments : list(list(numpy.ndarray), list(numpy.ndarray))
            Observations for H1-ESC and HFF cell types that matches the predictions.
            Exists if `targets` is specified.
        - normmats : list(list(numpy.ndarray), list(numpy.ndarray))
            Background distance-based expected balanced contact matrices for
            H1-ESC and HFF cell types that matches the predictions.
        - start_coords : list(int)
            Start coordinates for the prediction at each level.
        - end_coords : list(int)
            End coordinates for the prediction at each level.
        - chr : str
            The chromosome name.
        - annos : list(list(...))
            Annotation information. The format is as outputed by orca_utils.process_anno
            Exists if `annotation` is specified.
    """
    # Resolve model names to loaded model objects. Bug fix: the previous
    # try/except KeyError around `if m in model_dict_global:` was dead code
    # (membership tests do not raise), silently skipping unloaded models.
    model_objs = []
    for m in models:
        if isinstance(m, torch.nn.Module):
            model_objs.append(m)
        else:
            if m not in model_dict_global:
                load_resources(models=["256M"], use_cuda=use_cuda)
            model_objs.append(model_dict_global[m])
    models = model_objs
    # Bug fix: n_models was previously undefined in this function, causing a
    # NameError when assembling the output predictions below.
    n_models = len(models)
    with torch.no_grad():
        allpreds = []
        allstarts = []
        allnormmats = []
        if targets:
            alltargets = []
        if annotation is not None:
            allannos = []
        # iii == 0: forward strand; iii == 1: reverse-complement strand.
        for iii, seq in enumerate(
            [
                torch.FloatTensor(sequence),
                torch.FloatTensor(np.ascontiguousarray(sequence[:, ::-1, ::-1])),
            ]
        ):
            for ii, model in enumerate(models):
                normmat = normmats[ii]
                # Fill any NaNs in the background with its smallest finite value.
                normmat_nan = np.isnan(normmat)
                if np.any(normmat_nan):
                    normmat[normmat_nan] = np.nanmin(normmat[~normmat_nan])
                if targets and iii == 0:
                    target = targets[ii]
                (encoding32, encoding64, encoding128, encoding256) = model.net(
                    model.net1(
                        model.net0(
                            torch.Tensor(seq.float()).transpose(1, 2).cuda()
                            if use_cuda
                            else torch.Tensor(seq.float()).transpose(1, 2)
                        )
                    )[-1]
                )
                encodings = {
                    32: encoding32,
                    64: encoding64,
                    128: encoding128,
                    256: encoding256,
                }

                def eval_step(level, start, coarse_pred=None):
                    # Predict a 250x250 matrix at `level` conditioned on the
                    # (possibly strand-flipped) background expectation.
                    distenc = torch.log(
                        torch.FloatTensor(ns[level][None, :, :]).cuda()
                        if use_cuda
                        else torch.FloatTensor(ns[level][None, :, :])
                    ).expand(sequence.shape[0], 1, 250, 250)
                    if coarse_pred is not None:
                        pred = model.denets[level].forward(
                            encodings[level][
                                :, :, int(start / (level // 8)) : int(start / (level // 8)) + 250,
                            ],
                            distenc if iii == 0 else torch.flip(distenc, [2, 3]),
                            coarse_pred,
                        )
                    else:
                        pred = model.denets[level].forward(
                            encodings[level][
                                :, :, int(start / (level // 8)) : int(start / (level // 8)) + 250,
                            ],
                            distenc if iii == 0 else torch.flip(distenc, [2, 3]),
                        )
                    return pred

                preds = []
                starts = [0]
                ns = {}
                if targets and iii == 0:
                    ts = []
                if annotation is not None and iii == 0:
                    annos = []
                for j, level in enumerate([256, 128, 64, 32]):
                    # Downsample the background matrix to 250x250 for this window.
                    normmat_r = np.nanmean(
                        np.nanmean(
                            np.reshape(
                                normmat[
                                    starts[j] : starts[j] + 250 * level // 8,
                                    starts[j] : starts[j] + 250 * level // 8,
                                ],
                                (1, 250, level // 8, 250, level // 8),
                            ),
                            axis=4,
                        ),
                        axis=2,
                    )
                    ns[level] = normmat_r
                    if j == 0:
                        pred = eval_step(level, starts[j])
                    else:
                        pred = eval_step(
                            level,
                            starts[j],
                            preds[j - 1][
                                :,
                                :,
                                start_index : start_index + 125,
                                start_index : start_index + 125,
                            ],
                        )
                    if targets and iii == 0:
                        # Downsample observed matrix with nan-aware averaging.
                        target_r = np.nanmean(
                            np.nanmean(
                                np.reshape(
                                    target[
                                        :,
                                        starts[j] : starts[j] + 250 * level // 8,
                                        starts[j] : starts[j] + 250 * level // 8,
                                    ].numpy(),
                                    (target.shape[0], 250, level // 8, 250, level // 8),
                                ),
                                axis=4,
                            ),
                            axis=2,
                        )
                        target_nan = np.mean(
                            np.mean(
                                np.isnan(
                                    np.reshape(
                                        target[
                                            :,
                                            starts[j] : starts[j] + 250 * level // 8,
                                            starts[j] : starts[j] + 250 * level // 8,
                                        ].numpy(),
                                        (target.shape[0], 250, level // 8, 250, level // 8,),
                                    )
                                ),
                                axis=4,
                            ),
                            axis=2,
                        )
                        target_r[target_nan > nan_thresh] = np.nan
                        eps = np.nanmin(normmat_r)
                        target_np = np.log((target_r + eps) / (normmat_r + eps))[0, 0:, 0:]
                        ts.append(target_np)
                    if annotation is not None and iii == 0:
                        # Rescale annotations to the [0, 1] span of this window.
                        newstart = starts[j] / 8000.0
                        newend = (starts[j] + 250 * level // 8) / 8000.0
                        anno_r = []
                        for r in annotation:
                            if len(r) == 3:
                                if not (r[0] >= newend or r[1] <= newstart):
                                    anno_r.append(
                                        (
                                            np.fmax((r[0] - newstart) / (newend - newstart), 0,),
                                            np.fmin((r[1] - newstart) / (newend - newstart), 1,),
                                            r[2],
                                        )
                                    )
                            else:
                                if r[0] >= newstart and r[0] < newend:
                                    anno_r.append(((r[0] - newstart) / (newend - newstart), r[1]))
                        annos.append(anno_r)
                    # Choose the next zoom window; on the reverse strand the
                    # coordinates are mirrored around the window center.
                    if iii == 0:
                        proposed_start = (mpos - level * 1000000 / 4) - (
                            wpos - 128000000 + starts[j] * 4000 * 8
                        )
                    else:
                        proposed_start = (mpos - level * 1000000 / 4) - (
                            wpos + 128000000 - starts[j] * 4000 * 8 - level * 1000000
                        )
                    if chrlen is not None:
                        # Keep the zoom window within the first chromosome.
                        bounds = [
                            0 - (wpos - 128000000),
                            chrlen - level * 1000000 / 2 - (wpos - 128000000),
                        ]
                        if bounds[0] < bounds[1]:
                            proposed_start = np.clip(proposed_start, bounds[0], bounds[1])
                        else:
                            proposed_start = bounds[0]
                    start_index = int(np.clip(np.floor(proposed_start / (4000 * level)), 0, 125,))
                    if iii != 0:
                        start_index = 250 - (start_index + 125)
                    starts.append(starts[j] + start_index * level // 8)
                    preds.append(pred)
                allpreds.append(preds)
                allnormmats.append(ns)
                if iii == 0:
                    if targets:
                        alltargets.append(ts)
                    if annotation is not None:
                        allannos.append(annos)
                    allstarts.append(starts[:-1])
    output = {}
    # Average forward- and reverse-strand predictions (reverse is flipped back).
    output["predictions"] = [[] for _ in range(n_models)]
    for i in range(n_models):
        for j in range(len(allpreds[i])):
            if allpreds[i][j].shape[1] == 1:
                output["predictions"][i].append(
                    allpreds[i][j].cpu().detach().numpy()[0, 0, :, :] * 0.5
                    + allpreds[i + n_models][j].cpu().detach().numpy()[0, 0, ::-1, ::-1] * 0.5
                )
            else:
                output["predictions"][i].append(
                    allpreds[i][j].cpu().detach().numpy()[0, :, :, :] * 0.5
                    + allpreds[i + n_models][j].cpu().detach().numpy()[0, :, ::-1, ::-1] * 0.5
                )
    if targets:
        output["experiments"] = alltargets
    else:
        output["experiments"] = None
    output["start_coords"] = [wpos - 128000000 + s * 32000 for s in allstarts[0]]
    output["end_coords"] = [
        np.fmin(int(output["start_coords"][ii] + 256000000 / 2 ** (ii)), chrlen) for ii in range(4)
    ]
    if annotation is not None:
        output["annos"] = allannos[0]
    else:
        output["annos"] = None
    output["chr"] = mchr
    output["padding_chr"] = padding_chr
    output["normmats"] = allnormmats
    return output
def _retrieve_multi(regionlist, genome, target=True, normmat=True, normmat_regionlist=None):
    """Retrieve sequence, and optionally observed targets and background
    normmats, for a list of regions to assemble a (possibly multichromosomal)
    model input.

    Parameters
    ----------
    regionlist : list(tuple)
        Regions as (chrom, start, end, strand) or (chrom, start, end); a
        missing strand defaults to "+".
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        Reference genome object to extract sequence from.
    target : list(selene_utils2.Genomic2DFeatures) or bool, optional
        If a list, use these objects to retrieve observed data; if True, use
        the default 256Mb micro-C targets when available.
    normmat : list or bool, optional
        If a list, use these model objects for background matrices; if True,
        use the default 256Mb models.
    normmat_regionlist : list(tuple) or None, optional
        Regions for the background matrices; defaults to `regionlist`.

    Returns
    -------
    tuple
        (sequence,), optionally extended with (normmats,) and (targets,).
    """
    def _unpack(region):
        # Accept both 3- and 4-length regions; default strand is "+".
        if len(region) == 4:
            return region
        chrom, start, end = region
        return chrom, start, end, "+"

    sequences = []
    for region in regionlist:
        chrom, start, end, strand = _unpack(region)
        sequences.append(genome.get_encoding_from_coords(chrom, start, end, strand))
    sequence = np.vstack(sequences)[None, :, :]

    if isinstance(target, list):
        target_objs = target
        has_target = True
    elif target and target_available:
        target_objs = [target_h1esc_256m, target_hff_256m]
        has_target = True
    else:
        has_target = False

    if has_target:
        targets = []
        for target_obj in target_objs:
            rows = []
            for region in regionlist:
                chrom, start, end, strand = _unpack(region)
                row = []
                for region2 in regionlist:
                    # Bug fix: the 3-length branch previously assigned
                    # `strand = "+"` instead of `strand2 = "+"`, clobbering the
                    # outer region's strand and leaving strand2 undefined/stale.
                    chrom2, start2, end2, strand2 = _unpack(region2)
                    mat = target_obj.get_feature_data(
                        chrom, start, end, chrom2=chrom2, start2=start2, end2=end2
                    )
                    # Flip axes for minus-strand regions.
                    if strand == "-":
                        mat = mat[::-1, :]
                    if strand2 == "-":
                        mat = mat[:, ::-1]
                    row.append(mat)
                rows.append(row)
            targets.append(np.vstack([np.hstack(r) for r in rows]))
        targets = [torch.FloatTensor(t[None, :, :]) for t in targets]

    if normmat:
        if isinstance(normmat, list):
            normmat_objs = normmat
        else:
            normmat_objs = [h1esc_256m, hff_256m]
        if normmat_regionlist is None:
            normmat_regionlist = regionlist
        binsize = 32000
        normmats = []
        for normmat_obj in normmat_objs:
            rows = []
            # Bug fix: previously unpacked strictly 4-tuples here, crashing on
            # the 3-length regions accepted by the loops above.
            for region in normmat_regionlist:
                chrom, start, end, strand = _unpack(region)
                row = []
                for region2 in normmat_regionlist:
                    chrom2, start2, end2, strand2 = _unpack(region2)
                    if chrom2 != chrom:
                        # Interchromosomal: constant trans background.
                        block = np.full(
                            (int((end - start) / binsize), int((end2 - start2) / binsize)),
                            normmat_obj.background_trans,
                        )
                    else:
                        # Intrachromosomal: distance-based cis background.
                        acoor = np.linspace(start, end, int((end - start) / binsize) + 1)[:-1]
                        bcoor = np.linspace(start2, end2, int((end2 - start2) / binsize) + 1)[:-1]
                        block = normmat_obj.background_cis[
                            (np.abs(acoor[:, None] - bcoor[None, :]) / binsize).astype(int)
                        ]
                    if strand == "-":
                        block = block[::-1, :]
                    if strand2 == "-":
                        block = block[:, ::-1]
                    row.append(block)
                rows.append(row)
            normmats.append(np.vstack([np.hstack(r) for r in rows]))

    datatuple = (sequence,)
    if normmat:
        datatuple = datatuple + (normmats,)
    if has_target:
        datatuple = datatuple + (targets,)
    return datatuple
def process_region(
    mchr,
    mstart,
    mend,
    genome,
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for
    the specified region.

    Parameters
    ----------
    mchr : str
        The chromosome name of the region.
    mstart : int
        The start coordinate of the region.
    mend : int
        The end coordinate of the region.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from.
    file : str or None, optional
        Default is None. The output file prefix.
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally
        needed to fill the sequence to 256Mb. The padding sequence will be
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref : dict
        Reference-sequence predictions zooming into the center of the
        specified region, as a dictionary containing the prediction outputs
        and other retrieved information. The dictionary can be directly used
        as input to genomeplot or genomeplot_256Mb. See documentation of
        `genomepredict` or `genomepredict_256Mb` for details of the
        dictionary content.
    """
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()
    mpos = int((int(mstart) + int(mend)) / 2)
    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models
    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            # Observed data resources are unavailable; proceed without targets.
            target = False
    if window_radius == 16000000:
        wpos = coord_clip(mpos, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                )
                for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        # Bug fix: this branch previously tested an undefined name `has_target`
        # (a NameError); the resolved `target` value determines whether
        # _retrieve_multi returns observed targets.
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )
    # Bug fix: the condition previously read `mstart - mend < 2 * window_radius`,
    # which is always true for mstart < mend; the intent is to annotate only
    # regions that fit within the prediction window.
    if mend - mstart < 2 * window_radius:
        anno_scaled = process_anno(
            [
                [
                    np.clip(mstart, wpos - window_radius, wpos + window_radius),
                    np.clip(mend, wpos - window_radius, wpos + window_radius),
                    "black",
                ]
            ],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    else:
        anno_scaled = None
    if window_radius == 128000000:
        outputs_ref = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mpos,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref = genomepredict(
            sequence, mchr, mpos, wpos, annotation=anno_scaled, models=models, targets=targets, use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref, show_coordinates=True, file=file + ".256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".pdf",
            )
    return outputs_ref
def process_dup(
    mchr,
    mstart,
    mend,
    genome,
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for
    a duplication variant.

    Parameters
    ----------
    mchr : str
        The chromosome name of the duplicated segment.
    mstart : int
        The start coordinate of the duplication.
    mend : int
        The end coordinate of the duplication.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from.
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix. If None, no plot files
        are written.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally
        needed to fill the sequence to 256Mb. The padding sequence will be
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_l, outputs_ref_r, outputs_alt : dict, dict, dict
        Reference allele predictions zooming into the left boundary of the
        duplication,
        Reference allele predictions zooming into the right boundary of the
        duplication,
        Alternative allele predictions zooming into the duplication breakpoint.
        The returned results are in the format of dictionaries
        containing the prediction outputs and other
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of
        `genomepredict` or `genomepredict_256Mb` for details of the
        dictionary content.
    """
    # Raises IndexError if mchr is not a chromosome known to the genome object.
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()
    # Select default model identifiers matching the requested window size;
    # string identifiers are resolved by the downstream prediction functions.
    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models
    # Resolve experimental targets; fall back to no targets if a string
    # identifier is not found in the loaded resources.
    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False
    # ref.l: reference allele, window centered near the left duplication boundary
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        # 256Mb mode: load the whole chromosome (rounded to 32kb) padded with
        # sequence from padding_chr; the same sequence serves both ref zooms.
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )
    # Annotate the duplicated interval, clipped to the prediction window.
    if wpos + window_radius > mend:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius
        )
    else:
        anno_scaled = process_anno(
            [[mstart, wpos + window_radius, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 128000000:
        outputs_ref_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mstart,
            wpos,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            models=models,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_l = genomepredict(
            sequence,
            mchr,
            mstart,
            wpos,
            annotation=anno_scaled,
            models=models,
            targets=targets,
            use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_l, show_coordinates=True, file=file + ".ref.l.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_l,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.l.pdf",
            )
    # ref.r: reference allele, window centered near the right duplication boundary
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    if wpos - window_radius < mstart:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        outputs_ref_r = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot(
                outputs_ref_r,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.r.pdf",
            )
    else:
        # 256Mb mode reuses the whole-chromosome sequence/targets from ref.l.
        outputs_ref_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mend,
            wpos,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            models=models,
            targets=targets,
            use_cuda=use_cuda,
        )
        # Fix: this call previously ran even when file is None, crashing on
        # `None + ".ref.r.256m.pdf"`. Guard it like every other plot call.
        if file is not None:
            genomeplot_256Mb(
                outputs_ref_r, show_coordinates=True, file=file + ".ref.r.256m.pdf",
            )
    # alt (r): alternative allele carrying the tandem duplication
    s = StructuralChange2(mchr, chrlen)
    s.duplicate(mstart, mend)
    chrlen_alt = chrlen + mend - mstart
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen_alt)
        sequence = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        chrlen_alt_round = chrlen_alt - chrlen_alt % 32000
        if chrlen_alt_round < 256000000:
            wpos = 128000000
            (sequence, normmats) = _retrieve_multi(
                list(s[0:chrlen_alt_round]) + [[padding_chr, 0, 256000000 - chrlen_alt_round, "+"]],
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[
                    [mchr, 0, chrlen_alt_round, "+"],
                    [padding_chr, 0, 256000000 - chrlen_alt_round, "+"],
                ],
            )
        else:
            # Duplication grew the chromosome past 256Mb: window around the
            # breakpoint instead of taking the whole chromosome.
            wpos = coord_clip(mend, chrlen_alt_round, window_radius=128000000)
            (sequence, normmats) = _retrieve_multi(
                list(s[wpos - window_radius : wpos + window_radius]),
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[[mchr, wpos - window_radius, wpos + window_radius, "+"]],
            )
    # Annotate original copy (black) and duplicated copy (gray), each clipped
    # to the prediction window.
    if wpos - window_radius < mstart and mend + mend - mstart < wpos + window_radius:
        anno_scaled = process_anno(
            [[mstart, mend, "black"], [mend, mend + mend - mstart, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    elif wpos - window_radius >= mstart and mend + mend - mstart < wpos + window_radius:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"], [mend, mend + mend - mstart, "gray"],],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    elif wpos - window_radius < mstart and mend + mend - mstart >= wpos + window_radius:
        anno_scaled = process_anno(
            [[mstart, mend, "black"], [mend, wpos + window_radius, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"], [mend, wpos + window_radius, "gray"],],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        outputs_alt = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt, show_coordinates=True, file=file + ".alt.pdf")
    else:
        outputs_alt = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_alt_round,
            mend,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt, show_coordinates=True, file=file + ".alt.256m.pdf",
            )
    return outputs_ref_l, outputs_ref_r, outputs_alt
def process_del(
    mchr,
    mstart,
    mend,
    genome,
    cmap=None,
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for
    a deletion variant.

    Parameters
    ----------
    mchr : str
        The chromosome name of the deleted segment.
    mstart : int
        The start coordinate of the deletion.
    mend : int
        The end coordinate of the deletion.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from.
    cmap : optional
        Default is None. Passed through to `genomeplot` as the colormap
        for the 1-32Mb plots (not used by `genomeplot_256Mb`).
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix. If None, no plot files
        are written.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally
        needed to fill the sequence to 256Mb. The padding sequence will be
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_l, outputs_ref_r, outputs_alt : dict, dict, dict
        Reference allele predictions zooming into the left boundary of the
        deletion,
        Reference allele predictions zooming into the right boundary of the
        deletion,
        Alternative allele predictions zooming into the deletion breakpoint.
        The returned results are in the format of dictionaries
        containing the prediction outputs and other
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of
        `genomepredict` or `genomepredict_256Mb` for details of the
        dictionary content.
    """
    # Raises IndexError if mchr is not a chromosome known to the genome object.
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()
    # Select default model identifiers matching the requested window size.
    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models
    # Resolve experimental targets; fall back to no targets if a string
    # identifier is not found in the loaded resources.
    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False
    # ref.l: reference allele, window centered near the left deletion boundary
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            # Fix: `targets` was previously left unassigned on this path,
            # raising NameError at the genomepredict_256Mb(..., targets=targets)
            # calls below (all sibling functions set it to None here).
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )
    # Annotate the deleted interval, clipped to the prediction window.
    if wpos + window_radius > mend:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius
        )
    else:
        anno_scaled = process_anno(
            [[mstart, wpos + window_radius, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 128000000:
        outputs_ref_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_l = genomepredict(
            sequence,
            mchr,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            targets=targets,
            use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_l, show_coordinates=True, file=file + ".ref.l.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_l,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                cmap=cmap,
                file=file + ".ref.l.pdf",
            )
    # ref.r: reference allele, window centered near the right deletion boundary
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    if wpos - window_radius < mstart:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        outputs_ref_r = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot(
                outputs_ref_r,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                cmap=cmap,
                file=file + ".ref.r.pdf",
            )
    else:
        # 256Mb mode reuses the whole-chromosome sequence/targets from ref.l.
        outputs_ref_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mend,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_ref_r, show_coordinates=True, file=file + ".ref.r.256m.pdf",
            )
    # alt: alternative allele carrying the deletion
    s = StructuralChange2(mchr, chrlen)
    s.delete(mstart, mend)
    chrlen_alt = chrlen - (mend - mstart)
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen_alt)
        sequence = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        chrlen_alt_round = chrlen_alt - chrlen_alt % 32000
        wpos = 128000000
        (sequence, normmats) = _retrieve_multi(
            list(s[0:chrlen_alt_round]) + [[padding_chr, 0, 256000000 - chrlen_alt_round, "+"]],
            genome,
            target=False,
            normmat=True,
            normmat_regionlist=[
                [mchr, 0, chrlen_alt_round, "+"],
                [padding_chr, 0, 256000000 - chrlen_alt_round, "+"],
            ],
        )
    # Mark the deletion breakpoint with a double-line annotation.
    anno_scaled = process_anno(
        [[mstart, "double"]], base=wpos - window_radius, window_radius=window_radius
    )
    if window_radius == 16000000:
        outputs_alt = genomepredict(
            sequence, mchr, mstart, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt, show_coordinates=True, cmap=cmap, file=file + ".alt.pdf")
    else:
        outputs_alt = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_alt_round,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt, show_coordinates=True, file=file + ".alt.256m.pdf",
            )
    return outputs_ref_l, outputs_ref_r, outputs_alt
def process_inv(
    mchr,
    mstart,
    mend,
    genome,
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for
    an inversion variant.

    Parameters
    ----------
    mchr : str
        The chromosome name of the inverted segment.
    mstart : int
        The start coordinate of the inversion.
    mend : int
        The end coordinate of the inversion.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from.
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix. If None, no plot files
        are written.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally
        needed to fill the sequence to 256Mb. The padding sequence will be
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_l, outputs_ref_r, outputs_alt_l, outputs_alt_r : dict, dict, dict, dict
        Reference allele predictions zooming into the left boundary of the
        inversion,
        Reference allele predictions zooming into the right boundary of the
        inversion,
        Alternative allele predictions zooming into the left boundary of
        the inversion,
        Alternative allele prediction zooming into the right boundary of
        the inversion.
        The returned results are in the format of dictionaries
        containing the prediction outputs and other
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of
        `genomepredict` or `genomepredict_256Mb` for details of the
        dictionary content.
    """
    # Raises IndexError if mchr is not a chromosome known to the genome object.
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()
    # Select default model identifiers matching the requested window size;
    # string identifiers are resolved by the downstream prediction functions.
    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models
    # Resolve experimental targets; fall back to no targets if a string
    # identifier is not found in the loaded resources.
    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False
    # ref.l: reference allele, window centered near the left inversion boundary
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        # 256Mb mode: load the whole chromosome (rounded to 32kb) padded with
        # sequence from padding_chr; the same sequence serves both ref zooms.
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )
    # Annotate the inverted interval, clipped to the prediction window.
    if wpos + window_radius > mend:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[mstart, wpos + window_radius, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 128000000:
        outputs_ref_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_l = genomepredict(
            sequence,
            mchr,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            targets=targets,
            use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_l, show_coordinates=True, file=file + ".ref.l.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_l,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.l.pdf",
            )
    # ref.r: reference allele, window centered near the right inversion boundary
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    if wpos - window_radius < mstart:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        outputs_ref_r = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot(
                outputs_ref_r,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.r.pdf",
            )
    else:
        # 256Mb mode reuses the whole-chromosome sequence/targets from ref.l.
        outputs_ref_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mend,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_ref_r, show_coordinates=True, file=file + ".ref.r.256m.pdf",
            )
    # alt.l: alternative allele carrying the inversion, zooming into the
    # left boundary. An inversion preserves the chromosome length, so the
    # reference chrlen/chrlen_round values remain valid for the alt allele.
    s = StructuralChange2(mchr, chrlen)
    s.invert(mstart, mend)
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen)
        sequence = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                # reverse-complement: flip both the position and base axes
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        wpos = 128000000
        # NOTE(review): single-element tuple unpack — assumes _retrieve_multi
        # with target=False, normmat=False returns only the sequence.
        (sequence,) = _retrieve_multi(
            list(s[0:chrlen_round]) + [[padding_chr, 0, 256000000 - chrlen_round, "+"]],
            genome,
            target=False,
            normmat=False,
        )
    # normmats are not changed for inversion
    if mend < wpos + window_radius:
        anno_scaled = process_anno(
            [[mstart, mend, "gray"]], base=wpos - window_radius, window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[mstart, wpos + window_radius, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        outputs_alt_l = genomepredict(
            sequence, mchr, mstart, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt_l, show_coordinates=True, file=file + ".alt.l.pdf")
    else:
        outputs_alt_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt_l, show_coordinates=True, file=file + ".alt.l.256m.pdf",
            )
    # alt.r: alternative allele, zooming into the right inversion boundary.
    # In 256Mb mode the whole-chromosome alt sequence from alt.l is reused.
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen)
        sequence = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    if mstart > wpos - window_radius:
        anno_scaled = process_anno(
            [[mstart, mend, "gray"]], base=wpos - window_radius, window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        outputs_alt_r = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt_r, show_coordinates=True, file=file + ".alt.r.pdf")
    else:
        outputs_alt_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mend,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt_r, show_coordinates=True, file=file + ".alt.r.256m.pdf",
            )
    return outputs_ref_l, outputs_ref_r, outputs_alt_l, outputs_alt_r
def process_ins(
    mchr,
    mpos,
    ins_seq,
    genome,
    strand="+",
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for
    an insertion variant that inserts the specified sequence
    to the insertion site.

    Parameters
    ----------
    mchr : str
        The chromosome name of the insertion site.
    mpos : int
        The insertion site coordinate.
    ins_seq : str
        The inserted sequence in string format.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from.
    strand : str, optional
        Default is "+". The strand on which ins_seq is inserted.
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix. If None, no plot files
        are written.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally
        needed to fill the sequence to 256Mb. The padding sequence will be
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref, outputs_alt_l, outputs_alt_r : dict, dict, dict
        Reference allele predictions zooming into the insertion site,
        Alternative allele predictions zooming into the left boundary of
        the insertion sequence,
        Alternative allele prediction zooming into the right boundary of
        the insertion sequence.
        The returned results are in the format of dictionaries
        containing the prediction outputs and other
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of
        `genomepredict` or `genomepredict_256Mb` for details of the
        dictionary content.
    """
    # Raises IndexError if mchr is not a chromosome known to the genome object.
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()
    # Select default model identifiers matching the requested window size.
    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models
    # Resolve experimental targets; fall back to no targets if a string
    # identifier is not found in the loaded resources.
    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False
    # ref: reference allele, window centered at the insertion site
    if window_radius == 16000000:
        wpos = coord_clip(mpos, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        # normalize the chromosome name to the "chrN" form
                        "chr" + mchr.replace("chr", ""),
                        coord_round(wpos - window_radius),
                        coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        # 256Mb mode: load the whole chromosome (rounded to 32kb) padded with
        # sequence from padding_chr.
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )
    # Mark the insertion site with a single-line annotation.
    anno_scaled = process_anno(
        [[mpos, "single"]], base=wpos - window_radius, window_radius=window_radius
    )
    if window_radius == 128000000:
        # Fix: this was previously assigned to `outputs_ref_l`, which made the
        # final `return outputs_ref, ...` raise NameError in 256Mb mode.
        outputs_ref = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mpos,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref = genomepredict(
            sequence, mchr, mpos, wpos, annotation=anno_scaled, models=models, targets=targets, use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref, show_coordinates=True, file=file + ".ref.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.pdf",
            )
    # alt.l: alternative allele, zooming into the left insertion boundary
    s = StructuralChange2(mchr, chrlen)
    s.insert(mpos, len(ins_seq), strand=strand)
    chrlen_alt = chrlen + len(ins_seq)
    if window_radius == 16000000:
        wpos = coord_clip(mpos, chrlen_alt)
        sequence = []
        # seg_strand (not `strand`) to avoid clobbering the function parameter.
        for chr_name, start, end, seg_strand in s[wpos - window_radius : wpos + window_radius]:
            if chr_name.startswith("ins"):
                # segment comes from the inserted sequence itself
                seq = Genome.sequence_to_encoding(ins_seq[start:end])
            else:
                seq = genome.get_encoding_from_coords(chr_name, start, end)
            if seg_strand == "-":
                # reverse-complement: flip both the position and base axes
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        chrlen_alt_round = chrlen_alt - chrlen_alt % 32000
        if chrlen_alt_round < 256000000:
            wpos = 128000000
            (sequence, normmats) = _retrieve_multi(
                list(s[0:chrlen_alt_round]) + [[padding_chr, 0, 256000000 - chrlen_alt_round, "+"]],
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[
                    [mchr, 0, chrlen_alt_round, "+"],
                    [padding_chr, 0, 256000000 - chrlen_alt_round, "+"],
                ],
            )
        else:
            # Insertion grew the chromosome past 256Mb: window around the
            # insertion site instead of taking the whole chromosome.
            wpos = coord_clip(mpos, chrlen_alt_round, window_radius=128000000)
            (sequence, normmats) = _retrieve_multi(
                list(s[wpos - window_radius : wpos + window_radius]),
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[[mchr, wpos - window_radius, wpos + window_radius, "+"]],
            )
    # Annotate the inserted interval, clipped to the prediction window.
    if mpos + len(ins_seq) < wpos + window_radius:
        anno_scaled = process_anno(
            [[mpos, mpos + len(ins_seq), "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[mpos, wpos + window_radius, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        outputs_alt_l = genomepredict(
            sequence, mchr, mpos, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt_l, show_coordinates=True, file=file + ".alt.l.pdf")
    else:
        outputs_alt_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_alt_round,
            mpos,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt_l, show_coordinates=True, file=file + ".alt.l.256m.pdf",
            )
    # alt.r: alternative allele, zooming into the right insertion boundary
    if window_radius == 16000000:
        wpos = coord_clip(mpos + len(ins_seq), chrlen_alt)
        sequence = []
        for chr_name, start, end, seg_strand in s[wpos - window_radius : wpos + window_radius]:
            if chr_name.startswith("ins"):
                seq = Genome.sequence_to_encoding(ins_seq[start:end])
            else:
                seq = genome.get_encoding_from_coords(chr_name, start, end)
            if seg_strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        # When the alt chromosome fits in 256Mb, the whole-chromosome
        # sequence/normmats from alt.l are reused; otherwise re-window
        # around the right boundary.
        if chrlen_alt_round > 256000000:
            wpos = coord_clip(mpos + len(ins_seq), chrlen_alt_round, window_radius=128000000)
            (sequence, normmats) = _retrieve_multi(
                list(s[wpos - window_radius : wpos + window_radius]),
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[[mchr, wpos - window_radius, wpos + window_radius, "+"]],
            )
    # Fix: the condition previously read `mpos > wpos + window_radius`
    # (sign typo), which is essentially never true here, so the annotation
    # was always clipped to the window's left edge even when the insertion
    # start lay inside the window. The parallel alt.r logic in process_inv
    # uses `mstart > wpos - window_radius`.
    if mpos > wpos - window_radius:
        anno_scaled = process_anno(
            [[mpos, mpos + len(ins_seq), "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mpos + len(ins_seq), "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        # Fix: pass models=models (previously omitted, so custom_models was
        # silently ignored for this prediction only).
        outputs_alt_r = genomepredict(
            sequence, mchr, mpos + len(ins_seq), wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt_r, show_coordinates=True, file=file + ".alt.r.pdf")
    else:
        # Fix: this was previously assigned to `outputs_alt`, which made the
        # final `return ..., outputs_alt_r` raise NameError in 256Mb mode.
        outputs_alt_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_alt_round,
            mpos + len(ins_seq),
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt_r, show_coordinates=True, file=file + ".alt.r.256m.pdf",
            )
    return outputs_ref, outputs_alt_l, outputs_alt_r
def process_custom(
    region_list,
    ref_region_list,
    mpos,
    genome,
    ref_mpos_list=None,
    anno_list=None,
    ref_anno_list=None,
    custom_models=None,
    target=True,
    file=None,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for a custom
    variant specified by an ordered list of genomic segments.

    Parameters
    ----------
    region_list : list(list(...))
        List of segments that compose the alternative allele. Each segment is
        specified by a list( chr: str, start: int, end: int, strand: str),
        and segments are concatenated together in the given order. The total
        length should sum up to 32Mb. An example input is
        [['chr5', 89411065, 89411065+16000000, '-'], ['chr7', 94378248, 94378248+16000000,'+']].
    ref_region_list : list(list(...))
        The reference regions to predict. This can be any reference regions with
        the length of the specified window size. Each reference region is specified
        with a list( chr: str, start: int, end: int, strand: str). The strand must
        be '+'. The intended use is predicting the genome interactions for each
        segment that constitutes the alternative allele within the native
        reference sequence context. An example
        input is [['chr5', 89411065-16000000, 89411065+16000000,'+'],
        ['chr7', 94378248-16000000, 94378248+16000000,'+']].
    mpos : int
        The position to zoom into in the alternative allele. Note that `mpos`
        here specifies the relative position with respect to the start of the 32Mb.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from.
    ref_mpos_list : list(int) or None, optional
        Default is None. List of positions to zoom into for each of the
        reference regions specified in `ref_region_list`. If not specified,
        then zoom into the center of each region. Note that `ref_mpos_list`
        specifies the relative positions with respect to start of the 32Mb.
        For example, `16000000` means the center of the sequence.
    anno_list : list(list(...)) or None, optional
        Default is None. Annotation entries for the alternative allele,
        forwarded to `process_anno` with ``base=0`` (coordinates relative to
        the start of the concatenated sequence).
    ref_anno_list : list(list(...)) or None, optional
        Default is None. Annotation entries applied to every reference
        region, forwarded to `process_anno` with ``base=0``.
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. Currently only 16000000 (32Mb window) is accepted.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref, outputs_alt : dict, dict
        Reference region predictions and alternative allele predictions.
        Note that only the predictions for the LAST region in
        `ref_region_list` are returned; predictions for the other reference
        regions are written to `file` (if given) but not returned.
        The returned results are in the format of dictionaries
        containing the prediction outputs and other
        retrieved information. These dictionaries can be directly used as
        input to genomeplot. See documentation of `genomepredict` for
        details of the dictionary content.
    """
    # Resolve the models to use: default model names are selected by window
    # size, otherwise the caller-provided list is used verbatim.
    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models
    # Resolve targets: `True` maps to the default experimental datasets;
    # string entries are looked up in the globals populated by
    # `load_resources`. A missing key silently disables target retrieval.
    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False

    def validate_region_list(region_list, enforce_strand=None):
        # Check that every segment lies within its chromosome, that the
        # strands match `enforce_strand` when given, and that the total
        # length equals the full window (2 * window_radius).
        sumlen = 0
        for chrm, start, end, strand in region_list:
            chrlen = [l for c, l in genome.get_chr_lens() if c == chrm].pop()
            assert start >= 0 and end <= chrlen
            sumlen += end - start
            if enforce_strand:
                if strand != enforce_strand:
                    raise ValueError("The specified strand must be " + enforce_strand)
        assert sumlen == 2 * window_radius

    validate_region_list(region_list)
    # Predict each reference region independently; only the last iteration's
    # outputs survive in `outputs_ref` (see Returns note above).
    for i, ref_region in enumerate(ref_region_list):
        validate_region_list([ref_region], enforce_strand="+")
        ref_sequence = genome.get_encoding_from_coords(*ref_region)[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        ref_region[0], coord_round(ref_region[1]), coord_round(ref_region[2]),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
        anno_scaled = process_anno(ref_anno_list, base=0, window_radius=window_radius)
        # NOTE(review): the default zoom position below is an absolute
        # genomic coordinate (region start + window_radius), whereas the
        # docstring describes `ref_mpos_list` as relative to the region
        # start — confirm which coordinate system `genomepredict` expects.
        outputs_ref = genomepredict(
            ref_sequence,
            ref_region[0],
            ref_region[1] + window_radius if ref_mpos_list is None else ref_mpos_list[i],
            ref_region[1] + window_radius,
            annotation=anno_scaled,
            models=models,
            targets=targets,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot(
                outputs_ref,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref." + str(i) + ".pdf",
            )
    # Assemble the alternative allele by concatenating the encoded segments,
    # reverse-complementing any '-' strand segment (flip both the position
    # and the base-channel axes).
    sequence = []
    for chrm, start, end, strand in region_list:
        seq = genome.get_encoding_from_coords(chrm, start, end)
        if strand == "-":
            seq = seq[None, ::-1, ::-1].copy()
        else:
            seq = seq[None, :, :]
        sequence.append(seq)
    alt_sequence = np.concatenate(sequence, axis=1)
    anno_scaled = process_anno(anno_list, base=0, window_radius=window_radius)
    # The chimeric sequence has no single reference chromosome; `mpos` and
    # the zoom center are relative to the start of the concatenation.
    outputs_alt = genomepredict(
        alt_sequence, "chimeric", mpos, window_radius, models=models, annotation=anno_scaled, use_cuda=use_cuda,
    )
    if file is not None:
        genomeplot(outputs_alt, show_coordinates=False, file=file + ".alt.pdf")
    return outputs_ref, outputs_alt
def process_single_breakpoint(
    chr1,
    pos1,
    chr2,
    pos2,
    orientation1,
    orientation2,
    genome,
    custom_models=None,
    target=True,
    file=None,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for
    a simple translocation event that connects
    two chromosomal breakpoints. Specifically, two breakpoint
    positions and the corresponding two orientations are needed.
    The orientations decide how the breakpoints are connected.
    The ‘+’ or ‘-’ sign indicate whether the left or right side of
    the breakpoint is used. For example, for an input
    ('chr1', 85691449, 'chr5', 89533745 '+', '+'), two plus signs
    indicate connecting chr1:0-85691449 with chr5:0-89533745.

    Parameters
    ----------
    chr1 : str
        The chromosome name of the first segment
    pos1 : int
        The coordinate of breakpoint on the first segment
    chr2 : str
        The chromosome name of the second segment
    pos2 : int
        The coordinate of breakpoint on the second segment
    orientation1 : str
        Indicate which side of the breakpoint should be used for
        the first segment,
        '+' indicate the left and '-' indicate the right side.
    orientation2 : str
        Indicate which side of the breakpoint should be used for
        the second segment,
        '+' indicate the left and '-' indicate the right side.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally
        needed to fill the sequence to 256Mb. The padding sequence will be
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_1, outputs_ref_2, outputs_alt : dict, dict, dict
        Reference allele predictions zooming into the chr1 breakpoint,
        Reference allele predictions zooming into the chr2 breakpoint,
        Alternative allele prediction zooming into the junction.
        The returned results are in the format of dictionaries
        containing the prediction outputs and other
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of `genomepredict` or `genomepredict_256Mb` for
        details of the dictionary content.
    """
    # Resolve default model names by window size, or use the custom list.
    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models
    # Resolve targets: `True` maps to the default datasets; string entries
    # are looked up in the globals populated by `load_resources`; a missing
    # key silently disables target retrieval.
    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False
    chrlen1 = [l for c, l in genome.get_chr_lens() if c == chr1].pop()
    # --- Reference prediction around the chr1 breakpoint (ref.1) ---
    if window_radius == 16000000:
        # Clip the window center so that the full 32Mb window fits on chr1.
        wpos = coord_clip(pos1, chrlen1)
        sequence = genome.get_encoding_from_coords(
            chr1, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        chr1, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        # Round the chromosome length down to a multiple of 32kb and pad the
        # remainder of the 256Mb input with sequence from `padding_chr`.
        chrlen1_round = chrlen1 - chrlen1 % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[chr1, 0, chrlen1_round, "+"], [padding_chr, 0, 256000000 - chrlen1_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[chr1, 0, chrlen1_round, "+"], [padding_chr, 0, 256000000 - chrlen1_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )
    # Mark the breakpoint as a single-sided annotation within the window.
    anno_scaled = process_anno(
        [[pos1, "single"]], base=wpos - window_radius, window_radius=window_radius
    )
    if window_radius == 128000000:
        outputs_ref_1 = genomepredict_256Mb(
            sequence,
            chr1,
            normmats,
            chrlen1_round,
            pos1,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_1 = genomepredict(
            sequence, chr1, pos1, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_1, show_coordinates=True, file=file + ".ref.1.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_1,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.1.pdf",
                colorbar=True,
            )
    # --- Reference prediction around the chr2 breakpoint (ref.2) ---
    # Mirrors the ref.1 logic above for the second chromosome.
    chrlen2 = [l for c, l in genome.get_chr_lens() if c == chr2].pop()
    if window_radius == 16000000:
        wpos = coord_clip(pos2, chrlen2)
        sequence = genome.get_encoding_from_coords(
            chr2, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        chr2, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen2_round = chrlen2 - chrlen2 % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[chr2, 0, chrlen2_round, "+"], [padding_chr, 0, 256000000 - chrlen2_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[chr2, 0, chrlen2_round, "+"], [padding_chr, 0, 256000000 - chrlen2_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    anno_scaled = process_anno(
        [[pos2, "single"]], base=wpos - window_radius, window_radius=window_radius
    )
    if window_radius == 128000000:
        outputs_ref_2 = genomepredict_256Mb(
            sequence,
            chr2,
            normmats,
            chrlen2_round,
            pos2,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_2 = genomepredict(
            sequence, chr2, pos2, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_2, show_coordinates=True, file=file + ".ref.2.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_2,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.2.pdf",
                colorbar=True,
            )
    # --- Build the rearranged (alternative) chromosome ---
    # Keep the requested side of each breakpoint; '-' sides are additionally
    # inverted so the breakpoint end abuts the junction.
    # NOTE(review): the `pos - 1` / `+ 1` endpoints below presumably match
    # StructuralChange2's coordinate convention for delete/invert — confirm
    # against StructuralChange2 before changing.
    chrlen = [l for c, l in genome.get_chr_lens() if c == chr1].pop()
    s = StructuralChange2(chr1, chrlen)
    if orientation1 == "+":
        # Keep chr1 left of the breakpoint.
        s.delete(pos1, chrlen)
    else:
        # Keep chr1 right of the breakpoint and flip it so pos1 is at the end.
        s.delete(0, pos1 - 1)
        s.invert(0, chrlen - pos1 + 1)
    chrlen = [l for c, l in genome.get_chr_lens() if c == chr2].pop()
    s2 = StructuralChange2(chr2, chrlen)
    if orientation2 == "-":
        # Keep chr2 right of the breakpoint.
        s2.delete(0, pos2 - 1)
    else:
        # Keep chr2 left of the breakpoint and flip it so pos2 abuts the junction.
        s2.delete(pos2, chrlen)
        s2.invert(0, pos2)
    # Position of the junction in the rearranged chromosome: the length of
    # the retained chr1-derived segment.
    breakpos = s.coord_points[-1]
    s = s + s2
    if window_radius == 16000000:
        # Center a 32Mb window on the junction (clipped to the rearranged
        # chromosome length), and encode each constituent segment.
        wpos = coord_clip(breakpos, s.coord_points[-1])
        sequence = []
        curpos = 0
        anno = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                # Reverse-complement: flip both position and base channels.
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
            anno.append([curpos, curpos + end - start])
            curpos = curpos + end - start
        sequence = np.concatenate(sequence, axis=1)
    else:
        chrlen_alt_round = s.coord_points[-1] - s.coord_points[-1] % 32000
        if chrlen_alt_round < 256000000:
            # The rearranged chromosome fits in 256Mb: pad with padding_chr.
            wpos = 128000000
            (sequence, normmats) = _retrieve_multi(
                list(s[0:chrlen_alt_round]) + [[padding_chr, 0, 256000000 - chrlen_alt_round, "+"]],
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[
                    [chr1 + "|" + chr2, 0, chrlen_alt_round, "+"],
                    [padding_chr, 0, 256000000 - chrlen_alt_round, "+"],
                ],
            )
            curpos = 0
            anno = []
            for chrm, start, end, strand in s[0:chrlen_alt_round]:
                anno.append([curpos, curpos + end - start])
                curpos = curpos + end - start
        else:
            # The rearranged chromosome exceeds 256Mb: take a clipped window
            # around the junction instead of the whole chromosome.
            wpos = coord_clip(breakpos, chrlen_alt_round, window_radius=128000000)
            (sequence, normmats) = _retrieve_multi(
                list(s[wpos - window_radius : wpos + window_radius]),
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[
                    [chr1 + "|" + chr2, wpos - window_radius, wpos + window_radius, "+"]
                ],
            )
            curpos = 0
            anno = []
            for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
                anno.append([curpos, curpos + end - start])
                curpos = curpos + end - start
    # Annotate the junction: the end of the first retrieved segment (offset
    # within the window) marks where the two chromosomes are joined.
    anno_scaled = process_anno([[anno[0][-1], "double"]], base=0, window_radius=window_radius)
    if window_radius == 16000000:
        outputs_alt = genomepredict(
            sequence, chr1 + "|" + chr2, breakpos, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt, show_coordinates=False, file=file + ".alt.pdf", colorbar=True)
    else:
        outputs_alt = genomepredict_256Mb(
            sequence,
            chr1 + "|" + chr2,
            normmats,
            chrlen_alt_round,
            breakpos,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt, show_coordinates=True, file=file + ".alt.256m.pdf",
            )
    return outputs_ref_1, outputs_ref_2, outputs_alt
if __name__ == "__main__":
    # Command-line entry point: parse the requested prediction type and
    # coordinates with docopt, load the matching Orca resources, run the
    # corresponding process_* function, and save plots plus a .pth dump of
    # the raw outputs under <output_dir>.
    from docopt import docopt
    import sys
    import os

    doc = """
    Orca multiscale genome interaction sequence model prediction tool.

    Usage:
      orca_predict region [options] <coordinate> <output_dir>
      orca_predict del [options] <coordinate> <output_dir>
      orca_predict dup [options] <coordinate> <output_dir>
      orca_predict inv [options] <coordinate> <output_dir>
      orca_predict break [options] <coordinate> <output_dir>

    Options:
      -h --help      Show this screen.
      --show_genes   Show gene annotation (only supported for 32Mb models).
      --show_tracks  Show chromatin tracks (only supported for 32Mb models).
      --256m         Use 256Mb models (default is 32Mb).
      --nocuda       Use CPU implementation.
      --version      Show version.
    """
    # With no arguments, show the help text instead of a docopt usage error.
    if len(sys.argv) == 1:
        sys.argv.append("-h")
    arguments = docopt(doc, version="Orca v0.1")
    show_genes = arguments["--show_genes"]
    show_tracks = arguments["--show_tracks"]
    window_radius = 128000000 if arguments["--256m"] else 16000000
    use_cuda = not arguments["--nocuda"]
    # BUGFIX: load the resources that match the requested model scale. The
    # previous code always called load_resources(models=["32M"], ...), so
    # running with --256m selected window_radius=128000000 but never loaded
    # the 256Mb models (h1esc_256m / hff_256m), and every downstream
    # prediction call failed.
    load_resources(
        models=["32M"] if window_radius == 16000000 else ["256M"], use_cuda=use_cuda
    )
    # Exactly one of the docopt commands is True.
    if arguments["region"]:
        predtype = "region"
    elif arguments["del"]:
        predtype = "del"
    elif arguments["dup"]:
        predtype = "dup"
    elif arguments["inv"]:
        predtype = "inv"
    elif arguments["break"]:
        predtype = "break"

    def predict(chrm, start, end, savedir):
        # Run a plain reference-region prediction and persist the outputs.
        # `hg38` and `target_available` are globals created by
        # load_resources (presumably; defined outside this block).
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        with torch.no_grad():
            outputs = process_region(
                chrm,
                start,
                end,
                hg38,
                target=target_available,
                file=savedir + "/orca_pred",
                show_genes=show_genes,
                show_tracks=show_tracks,
                window_radius=window_radius,
                padding_chr="chr1",
                use_cuda=use_cuda,
            )
        torch.save(outputs, savedir + "/orca_pred.pth")
        return None

    def get_interactions(predtype, content, savedir):
        # Dispatch on the prediction type, parse `content` (the <coordinate>
        # argument), run the corresponding process_* function, and save the
        # outputs. `pdf_names` mirrors the plot files each branch produces
        # (NOTE: currently assigned but not read in this block).
        if predtype == "region":
            pdf_names = ["orca_pred.pdf"]
            if show_genes or show_tracks:
                pdf_names += ["orca_pred.anno.pdf"]
            # Accept "chr1:100-200" or "1:100-200".
            chrstr, coordstr = str(content).split(":")
            chrstr = "chr" + chrstr.replace("chr", "")
            coord_s, coord_e = coordstr.split("-")
            predict(chrstr, int(coord_s), int(coord_e), savedir)
        elif predtype in ["dup", "del"]:
            pdf_names = ["orca_pred.ref.l.pdf", "orca_pred.ref.r.pdf", "orca_pred.alt.pdf"]
            if show_genes or show_tracks:
                pdf_names += [
                    "orca_pred.ref.l.anno.pdf",
                    "orca_pred.ref.r.anno.pdf",
                    "orca_pred.alt.anno.pdf",
                ]
            chrstr, coordstr = str(content).split(":")
            chrstr = "chr" + chrstr.replace("chr", "")
            coord_s, coord_e = coordstr.split("-")
            if not os.path.exists(savedir):
                os.makedirs(savedir)
            if predtype == "dup":
                outputs_ref_l, outputs_ref_r, outputs_alt = process_dup(
                    chrstr,
                    int(coord_s),
                    int(coord_e),
                    hg38,
                    target=target_available,
                    show_genes=show_genes,
                    show_tracks=show_tracks,
                    file=savedir + "/orca_pred",
                    window_radius=window_radius,
                    use_cuda=use_cuda,
                )
            else:
                outputs_ref_l, outputs_ref_r, outputs_alt = process_del(
                    chrstr,
                    int(coord_s),
                    int(coord_e),
                    hg38,
                    target=target_available,
                    show_genes=show_genes,
                    show_tracks=show_tracks,
                    file=savedir + "/orca_pred",
                    window_radius=window_radius,
                    use_cuda=use_cuda,
                )
            torch.save(
                {
                    "outputs_ref_l": outputs_ref_l,
                    "outputs_ref_r": outputs_ref_r,
                    "outputs_alt": outputs_alt,
                },
                savedir + "/orca_pred.pth",
            )
        elif predtype == "inv":
            pdf_names = [
                "orca_pred.ref.l.pdf",
                "orca_pred.ref.r.pdf",
                "orca_pred.alt.l.pdf",
                "orca_pred.alt.r.pdf",
            ]
            if show_genes or show_tracks:
                pdf_names += [
                    "orca_pred.ref.l.anno.pdf",
                    "orca_pred.ref.r.anno.pdf",
                    "orca_pred.alt.l.anno.pdf",
                    "orca_pred.alt.r.anno.pdf",
                ]
            chrstr, coordstr = str(content).split(":")
            chrstr = "chr" + chrstr.replace("chr", "")
            coord_s, coord_e = coordstr.split("-")
            if not os.path.exists(savedir):
                os.makedirs(savedir)
            outputs_ref_l, outputs_ref_r, outputs_alt_l, outputs_alt_r = process_inv(
                chrstr,
                int(coord_s),
                int(coord_e),
                hg38,
                target=target_available,
                show_genes=show_genes,
                show_tracks=show_tracks,
                file=savedir + "/orca_pred",
                window_radius=window_radius,
                use_cuda=use_cuda,
            )
            torch.save(
                {
                    "outputs_ref_l": outputs_ref_l,
                    "outputs_ref_r": outputs_ref_r,
                    "outputs_alt_l": outputs_alt_l,
                    "outputs_alt_r": outputs_alt_r,
                },
                savedir + "/orca_pred.pth",
            )
        elif predtype == "break":
            pdf_names = ["orca_pred.ref.1.pdf", "orca_pred.ref.2.pdf", "orca_pred.alt.pdf"]
            if show_genes or show_tracks:
                pdf_names += [
                    "orca_pred.ref.1.anno.pdf",
                    "orca_pred.ref.2.anno.pdf",
                    "orca_pred.alt.anno.pdf",
                ]
            # Expected format: "chr1:pos1 chr2:pos2 o1/o2" (tabs tolerated).
            chr_coord_1, chr_coord_2, orientations = str(content.replace("\t", " ")).split(" ")
            chr1, coord1 = chr_coord_1.split(":")
            chr2, coord2 = chr_coord_2.split(":")
            chr1 = "chr" + chr1.replace("chr", "")
            chr2 = "chr" + chr2.replace("chr", "")
            orientation1, orientation2 = orientations.split("/")
            if not os.path.exists(savedir):
                os.makedirs(savedir)
            outputs_ref_1, outputs_ref_2, outputs_alt = process_single_breakpoint(
                chr1,
                int(coord1),
                chr2,
                int(coord2),
                orientation1,
                orientation2,
                hg38,
                target=target_available,
                show_genes=show_genes,
                show_tracks=show_tracks,
                file=savedir + "/orca_pred",
                window_radius=window_radius,
                use_cuda=use_cuda,
            )
            torch.save(
                {
                    "outputs_ref_1": outputs_ref_1,
                    "outputs_ref_2": outputs_ref_2,
                    "outputs_alt": outputs_alt,
                },
                savedir + "/orca_pred.pth",
            )
        else:
            raise ValueError("Unexpected prediction type!")
        return None

    get_interactions(predtype, arguments["<coordinate>"], arguments["<output_dir>"])