Source code for orca_predict

"""
This module provides functions for using Orca models for various
types of the predictions. This is the main module that you need for 
interacting with Orca models.

To use any of the prediction functions, `load_resources` has to be
called first to load the necessary resources.

The coordinates used in Orca are 0-based, inclusive for the start
coordinate and exclusive for the end coordinate, consistent with
python conventions. 
"""
import os
import pathlib

import numpy as np
import torch
from scipy.stats import spearmanr

from selene_utils2 import MemmapGenome, Genomic2DFeatures
import selene_sdk
from selene_sdk.sequences import Genome

from orca_models import H1esc, Hff, H1esc_1M, Hff_1M, H1esc_256M, Hff_256M
from orca_utils import (
    genomeplot,
    genomeplot_256Mb,
    StructuralChange2,
    process_anno,
    coord_round,
    coord_clip,
)

ORCA_PATH = str(pathlib.Path(__file__).parent.absolute())
model_dict_global, target_dict_global = {}, {}

[docs]def load_resources(models=["32M"], use_cuda=True, use_memmapgenome=True):
    """
    Load resources for Orca predictions including the specified 
    Orca models and hg38 reference genome. It also creates Genomic2DFeatures
    objects for experimental micro-C  datasets (for comparison with prediction).
    Load resourced are accessible as global variables.
    
    The list of globl variables generated is here:

    Global Variables
    ----------------
    hg38 : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        If `use_memmapgenome==True` and the resource file for hg38 mmap exists,
        use MemmapGenome instead of Genome.
    h1esc : orca_models.H1esc
        1-32Mb Orca H1-ESC model
    hff : orca_models.Hff
        1-32Mb Orca HFF model
    h1esc_256m : orca_models.H1esc_256M
        32-256Mb Orca H1-ESC model
    hff_256m : orca_models.Hff_256M
        32-256Mb Orca HFF model
    h1esc_1m : orca_models.H1esc_1M
        1Mb Orca H1-ESC model
    hff_1m : orca_models.Hff_1M
        1Mb Orca HFF model
    target_h1esc : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load H1-ESC micro-C dataset 4DNFI9GMP2J8
        at 4kb resolution, used for comparison with 1-32Mb models.
    target_hff : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load HFF micro-C dataset 4DNFI643OYP9
        at 4kb resolution, used for comparison with 1-32Mb models.
    target_h1esc_256m : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load H1-ESC micro-C dataset 4DNFI9GMP2J8
        at 32kb resolution, used for comparison with 32-256Mb models.
    target_hff_256m : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load HFF micro-C dataset 4DNFI643OYP9
        at 32kb resolution, used for comparison with 32-256Mb models.
    target_h1esc_1m : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load H1-ESC micro-C dataset 4DNFI9GMP2J8
        at 32kb resolution, used for comparison with 1Mb models.
    target_hff_1m : selene_utils2.Genomic2DFeatures
        Genomic2DFeatures object that load HFF micro-C dataset 4DNFI643OYP9
        at 1kb resolution, used for comparison with 1Mb models.
    target_available : bool
        Indicate whether the micro-C dataset resource file is available.

    Parameters
    ----------
    models : list(str)
        List of model types to load, supported model types includes
        "32M", "256M", "1M", corresponding to 1-32Mb, 32-256Mb, and 1Mb
        models. Lower cases are also accepted.  
    use_cuda : bool, optional
        Default is True. If true, loaded models are moved to GPU.
    use_memmapgenome : bool, optional
        Default is True. If True and the resource file for hg38 mmap exists,
        use MemmapGenome instead of Genome.

 
"""
    global hg38, target_hff, target_h1esc, target_hff_256m, target_h1esc_256m, target_hff_1m, target_h1esc_1m, target_available

    if "32M" or "32m" in models:
        global h1esc, hff
        h1esc = H1esc()
        h1esc.eval()
        hff = Hff()
        hff.eval()
        if use_cuda:
            h1esc.cuda()
            hff.cuda()
        else:
            h1esc.cpu()
            hff.cpu()
        model_dict_global["h1esc"] = h1esc
        model_dict_global["hff"] = hff

    if "1M" or "1m" in models:
        global h1esc_1m, hff_1m
        h1esc_1m = H1esc_1M()
        h1esc_1m.eval()
        hff_1m = Hff_1M()
        hff_1m.eval()
        if use_cuda:
            h1esc_1m.cuda()
            hff_1m.cuda()
        else:
            h1esc_1m.cpu()
            hff_1m.cpu()
        model_dict_global["h1esc_1m"] = h1esc_1m
        model_dict_global["hff_1m"] = hff_1m

    if "256M" or "256m" in models:
        global h1esc_256m, hff_256m
        h1esc_256m = H1esc_256M()
        h1esc_256m.eval()
        hff_256m = Hff_256M()
        hff_256m.eval()
        if use_cuda:
            h1esc_256m.cuda()
            hff_256m.cuda()
        else:
            h1esc_256m.cpu()
            hff_256m.cpu()
        model_dict_global["h1esc_256m"] = h1esc_256m
        model_dict_global["hff_256m"] = hff_256m

    if (
        use_memmapgenome
        and pathlib.Path("/resources/Homo_sapiens.GRCh38.dna.primary_assembly.fa.mmap").exists()
    ):
        hg38 = MemmapGenome(
            input_path=ORCA_PATH + "/resources/Homo_sapiens.GRCh38.dna.primary_assembly.fa",
            memmapfile=ORCA_PATH + "/resources/Homo_sapiens.GRCh38.dna.primary_assembly.fa.mmap",
        )
    else:
        hg38 = Genome(
            input_path=ORCA_PATH + "/resources/Homo_sapiens.GRCh38.dna.primary_assembly.fa",
        )

    target_available = True
    if os.path.exists(ORCA_PATH + "/resources/4DNFI643OYP9.rebinned.mcool"):
        target_hff = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI643OYP9.rebinned.mcool::/resolutions/4000"],
            ["r4000"],
            (8000, 8000),
            cg=True,
        )
        target_hff_256m = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI643OYP9.rebinned.mcool::/resolutions/32000"],
            ["r32000"],
            (8000, 8000),
            cg=True,
        )
        target_hff_1m = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI643OYP9.rebinned.mcool::/resolutions/1000"],
            ["r1000"],
            (8000, 8000),
            cg=True,
        )
        target_dict_global['hff'] = target_hff
        target_dict_global['hff_256m'] = target_hff_256m
        target_dict_global['hff_1m'] = target_hff_1m
    else:
        target_available = False

    if os.path.exists(ORCA_PATH + "/resources/4DNFI9GMP2J8.rebinned.mcool"):
        target_h1esc = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI9GMP2J8.rebinned.mcool::/resolutions/4000"],
            ["r4000"],
            (8000, 8000),
            cg=True,
        )

        target_h1esc_256m = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI9GMP2J8.rebinned.mcool::/resolutions/32000"],
            ["r32000"],
            (8000, 8000),
            cg=True,
        )

        target_h1esc_1m = Genomic2DFeatures(
            [ORCA_PATH + "/resources/4DNFI9GMP2J8.rebinned.mcool::/resolutions/1000"],
            ["r1000"],
            (8000, 8000),
            cg=True,
        )
        target_dict_global['h1esc'] = target_h1esc
        target_dict_global['h1esc_256m'] = target_h1esc_256m
        target_dict_global['h1esc_1m'] = target_h1esc_1m
    else:
        target_available = False


[docs]def genomepredict(
    sequence, mchr, mpos=-1, wpos=-1, models=["h1esc", "hff"], targets=None, annotation=None, use_cuda=True, nan_thresh=1,
):
    """Multiscale prediction for a 32Mb sequence
    input, zooming into the position specified when generating a series
    of 32Mb, 16Mb, 8Mb, 4Mb, 2Mb and 1Mb predictions with increasing
    resolutions (up to 4kb). This function also processes 
    information used only for plotting including targets and annotation.

    For larger sequence and interchromosomal predictions, you can use 
    256Mb input with genomepredict_256Mb.

    Parameters
    ----------
    sequence : numpy.ndarray
        One-hot sequence encoding of shape 1 x 4 x 32000000. 
        The encoding can be generated with `selene_sdk.Genome.sequence_to_encoding()`.
    mchr : str
        Chromosome name. This is used for annotation purpose only.
    mpos : int, optional
        The coordinate to zoom into for multiscale prediction.
    wpos : int, optional
        The coordinate of the center position of the sequence, which is
        start position + 16000000.
    models : list(torch.nn.Module or str), optional
        Models to use. Default is H1-ESC and HFF Orca models.
    targets : list(numpy.ndarray), optional
        The observed balanced contact matrices from the
        32Mb region. Used only for plotting when used with genomeplot. The length and
        order of the list of targets should match the models specified (default is 
        H1-ESC and HFF Orca models).
        The dimensions of the arrays should be 8000 x 8000 (1kb resolution).
    annotation : str or None, optional
        List of annotations for plotting. The annotation can be generated with
        See orca_utils.process_anno and see its documentation for more details.
    use_cuda : bool, optional
        Default is True. If False, use CPU.
    nan_thresh : int, optional
        Default is 1. Specify the threshold of the proportion of NaNs values 
        allowed during downsampling for the observed matrices. Only relevant for plotting. 
        The lower resolution observed matrix value are computed by averaging multiple 
        bins into one. By default, we allow missing values and only average over the 
        non-missing values, and the values with more than the specified proprotion 
        of missing values will be filled with NaN.
        
    Returns
    ----------
    output : dict
        Result dictionary that can be used as input for genomeplot. The dictionary
        has the following keys:
            - predictions : list(list(numpy.ndarray), list(numpy.ndarray))
                Multi-level predictions for H1-ESC and HFF cell types.
            - experiments : list(list(numpy.ndarray), list(numpy.ndarray))
                Observations for H1-ESC and HFF cell types that matches the predictions.
                Exists if `targets` is specified.
            - normmats : list(list(numpy.ndarray), list(numpy.ndarray))
                Background distance-based expected balanced contact matrices for 
                H1-ESC and HFF cell types that matches the predictions.
            - start_coords : list(int)
                Start coordinates for the prediction at each level.
            - end_coords : list(int)
                End coordinates for the prediction at each level.
            - chr : str
                The chromosome name.
            - annos : list(list(...))
                Annotation information. The format is as outputed by orca_utils.process_anno
                Exists if `annotation` is specified.


    """
    model_objs = []
    for m in models:
        if isinstance(m, torch.nn.Module):
            model_objs.append(m)
        else:
            try:
                if m in model_dict_global:
                    model_objs.append(model_dict_global[m])
            except KeyError:
                load_resources(models=["32M"], use_cuda=use_cuda)
                if m in model_dict_global:
                    model_objs.append(model_dict_global[m])
    models = model_objs
    n_models = len(models)

    with torch.no_grad():
        allpreds = []
        allstarts = []
        if targets:
            alltargets = []
        if annotation is not None:
            allannos = []

        for iii, seq in enumerate(
            [
                torch.FloatTensor(sequence),
                torch.FloatTensor(np.ascontiguousarray(sequence[:, ::-1, ::-1])),
            ]
        ):
            for ii, model in enumerate(models):
                if targets and iii == 0:
                    target = targets[ii]
                (encoding1, encoding2, encoding4, encoding8, encoding16, encoding32,) = model.net(
                    model.net0(torch.Tensor(seq.float()).transpose(1, 2).cuda())
                    if use_cuda
                    else model.net0(torch.Tensor(seq.float()).transpose(1, 2))
                )

                encodings = {
                    1: encoding1,
                    2: encoding2,
                    4: encoding4,
                    8: encoding8,
                    16: encoding16,
                    32: encoding32,
                }

                def eval_step(level, start, coarse_pred=None):
                    distenc = torch.log(
                        torch.FloatTensor(model.normmats[level][None, None, :, :]).cuda()
                        if use_cuda
                        else torch.FloatTensor(model.normmats[level][None, None, :, :])
                    ).expand(sequence.shape[0], 1, 250, 250)
                    if coarse_pred is not None:
                        if level == 1:
                            pred = model.denets[level].forward(
                                encodings[level][
                                    :, :, int(start / level) : int(start / level) + 250
                                ],
                                distenc,
                                coarse_pred,
                            ) + model.denet_1_pt.forward(
                                encodings[level][
                                    :, :, int(start / level) : int(start / level) + 250
                                ]
                            )
                        else:
                            pred = model.denets[level].forward(
                                encodings[level][
                                    :, :, int(start / level) : int(start / level) + 250
                                ],
                                distenc,
                                coarse_pred,
                            )
                    else:
                        pred = model.denets[level].forward(
                            encodings[level][:, :, int(start / level) : int(start / level) + 250],
                            distenc,
                        )

                    return pred

                preds = []
                starts = [0]
                if targets and iii == 0:
                    ts = []
                if annotation is not None and iii == 0:
                    annos = []
                for j, level in enumerate([32, 16, 8, 4, 2, 1]):
                    if j == 0:
                        pred = eval_step(level, starts[j])
                    else:
                        pred = eval_step(
                            level,
                            starts[j],
                            preds[j - 1][
                                :,
                                :,
                                start_index : start_index + 125,
                                start_index : start_index + 125,
                            ],
                        )

                    if targets and iii == 0:
                        target_r = np.nanmean(
                            np.nanmean(
                                np.reshape(
                                    target[
                                        :,
                                        starts[j] : starts[j] + 250 * level,
                                        starts[j] : starts[j] + 250 * level,
                                    ].numpy(),
                                    (target.shape[0], 250, level, 250, level),
                                ),
                                axis=4,
                            ),
                            axis=2,
                        )
                        target_nan = np.mean(
                            np.mean(
                                np.isnan(
                                    np.reshape(
                                        target[
                                            :,
                                            starts[j] : starts[j] + 250 * level,
                                            starts[j] : starts[j] + 250 * level,
                                        ].numpy(),
                                        (target.shape[0], 250, level, 250, level),
                                    )
                                ),
                                axis=4,
                            ),
                            axis=2,
                        )
                        target_r[target_nan > nan_thresh] = np.nan
                        target_np = np.log(
                            (target_r + model.epss[level])
                            / (model.normmats[level] + model.epss[level])
                        )[0, 0:, 0:]
                        ts.append(target_np)

                    if annotation is not None and iii == 0:
                        newstart = starts[j] / 8000.0
                        newend = (starts[j] + 250 * level) / 8000.0
                        anno_r = []
                        for r in annotation:
                            if len(r) == 3:
                                if not (r[0] >= newend or r[1] <= newstart):
                                    anno_r.append(
                                        (
                                            np.fmax((r[0] - newstart) / (newend - newstart), 0,),
                                            np.fmin((r[1] - newstart) / (newend - newstart), 1,),
                                            r[2],
                                        )
                                    )
                            else:
                                if r[0] >= newstart and r[0] < newend:
                                    anno_r.append(((r[0] - newstart) / (newend - newstart), r[1]))
                        annos.append(anno_r)

                    if iii == 0:
                        start_index = int(
                            np.clip(
                                np.floor(
                                    (
                                        (mpos - level * 1000000 / 4)
                                        - (wpos - 16000000 + starts[j] * 4000)
                                    )
                                    / (4000 * level)
                                ),
                                0,
                                125,
                            )
                        )
                    else:
                        start_index = int(
                            np.clip(
                                np.ceil(
                                    (
                                        (wpos + 16000000 - starts[j] * 4000)
                                        - (mpos + level * 1000000 / 4)
                                    )
                                    / (4000 * level)
                                ),
                                0,
                                125,
                            )
                        )

                    starts.append(starts[j] + start_index * level)
                    preds.append(pred)

                allpreds.append(preds)
                if iii == 0:
                    if targets:
                        alltargets.append(ts)
                    if annotation is not None:
                        allannos.append(annos)
                    allstarts.append(starts[:-1])

    output = {}
    output["predictions"] = [[] for _ in range(n_models)]
    for i in range(n_models):
        for j in range(len(allpreds[i])):
            if allpreds[i][j].shape[1] == 1:
                output["predictions"][i].append(
                    allpreds[i][j].cpu().detach().numpy()[0, 0, :, :] * 0.5
                    + allpreds[i + n_models][j].cpu().detach().numpy()[0, 0, ::-1, ::-1] * 0.5
                )
            else:
                output["predictions"][i].append(
                    allpreds[i][j].cpu().detach().numpy()[0, :, :, :] * 0.5
                    + allpreds[i + n_models][j].cpu().detach().numpy()[0, :, ::-1, ::-1] * 0.5
                )
    if targets:
        output["experiments"] = alltargets
    else:
        output["experiments"] = None
    output["start_coords"] = [wpos - 16000000 + s * 4000 for s in allstarts[0]]
    output["end_coords"] = [
        int(output["start_coords"][ii] + 32000000 / 2 ** (ii)) for ii in range(6)
    ]
    output["chr"] = mchr
    if annotation is not None:
        output["annos"] = allannos[0]
    else:
        output["annos"] = None
    output["normmats"] = [
        [model.normmats[ii] for ii in [32, 16, 8, 4, 2, 1]] for model in models
    ]
    return output


[docs]def genomepredict_256Mb(
    sequence,
    mchr,
    normmats,
    chrlen,
    mpos=-1,
    wpos=-1,
    models=["h1esc_256m", "hff_256m"],
    targets=None,
    annotation=None,
    padding_chr=None,
    use_cuda=True,
    nan_thresh=1,
):
    """Multiscale prediction for a 256Mb sequence
    input, zooming into the position specified when generating a series
    of 256Mb, 128Mb, 64Mb, and 32Mb predictions with increasing
    resolutions (up to 128kb). This function also processes 
    information used only for plotting including targets and annotation.

    This function accepts multichromosal input sequence. Thus it needs an
    extra input `normmats` to encode the chromosomal information. See documentation
    for normmats argument for details.

    Parameters
    ----------
    sequence : numpy.ndarray
        One-hot sequence encoding of shape 1 x 4 x 256000000. 
        The encoding can be generated with `selene_sdk.Genome.sequence_to_encoding()`.
    mchr : str
        The chromosome name of the first chromosome included in the seqeunce. 
        This is used for annotation purpose only.
    normmats : list(numpy.ndarray)
        A list of distance-based background matrices for H1-ESC and HFF.The
        normmats contains arrays with dimensions 8000 x 8000  (32kb resolution). 
        Interchromosomal interactions are filled with the expected balanced contact
        score for interchromomsal interactions.
    chrlen : int
        The coordinate of the end of the first chromosome in the input, which is the 
        chromosome that will be zoomed into.
    mpos : int, optional
        Default is -1. The coordinate to zoom into for multiscale prediction. If neither
        `mpos` nor `wpos` are specified, it zooms into the center of the input by default.
    wpos : int, optional
        Default is -1. The coordinate of the center position of the sequence, which is
        start position + 16000000. If neither `mpos` nor `wpos` are specified, it zooms 
        into the center of the input by default.
    models : list(torch.nn.Module or str), optional
        Models to use. Default is H1-ESC(256Mb) and HFF(256Mb) Orca models.
    targets : list(numpy.ndarray), optional
        The observed balanced contact matrices from the 256Mb sequence. 
        Used only for plotting when used with genomeplot. The length and
        order of the list of targets should match the models specified (default is 
        H1-ESC and HFF Orca models). The dimensions of the arrays should be 
        8000 x 8000 (32kb resolution).
    annotation : str or None, optional
        Default is None. List of annotations for plotting. The annotation can be generated with
        See orca_utils.process_anno and see its documentation for more details.
    padding_chr : str, None, optional
        Default is None. Name of the padding chromosome after the first. Used for annotation
        only. TODO: be more flexible in the support for multiple chromosomes.
    use_cuda : bool, optional
        Default is True. If False, use CPU.
    nan_thresh : int, optional
        Default is 1. Specify the threshold of the proportion of NaNs values 
        allowed during downsampling for the observed matrices. Only relevant for plotting. 
        The lower resolution observed matrix value are computed by averaging multiple 
        bins into one. By default, we allow missing values and only average over the 
        non-missing values, and the values with more than the specified proprotion 
        of missing values will be filled with NaN.
        
    Returns
    ----------
    output : dict
        Result dictionary that can be used as input for genomeplot. The dictionary
        has the following keys:
            - predictions : list(list(numpy.ndarray), list(numpy.ndarray))
                Multi-level predictions for H1-ESC and HFF cell types.
            - experiments : list(list(numpy.ndarray), list(numpy.ndarray))
                Observations for H1-ESC and HFF cell types that matches the predictions.
                Exists if `targets` is specified.
            - normmats : list(list(numpy.ndarray), list(numpy.ndarray))
                Background distance-based expected balanced contact matrices for 
                H1-ESC and HFF cell types that matches the predictions.
            - start_coords : list(int)
                Start coordinates for the prediction at each level.
            - end_coords : list(int)
                End coordinates for the prediction at each level.
            - chr : str
                The chromosome name.
            - annos : list(list(...))
                Annotation information. The format is as outputed by orca_utils.process_anno
                Exists if `annotation` is specified.
    """
    model_objs = []
    for m in models:
        if isinstance(m, torch.nn.Module):
            model_objs.append(m)
        else:
            try:
                if m in model_dict_global:
                    model_objs.append(model_dict_global[m])
            except KeyError:
                load_resources(models=["256M"], use_cuda=use_cuda)
                if m in model_dict_global:
                    model_objs.append(model_dict_global[m])
    models = model_objs

    with torch.no_grad():
        allpreds = []
        allstarts = []
        allnormmats = []
        if targets:
            alltargets = []
        if annotation is not None:
            allannos = []

        for iii, seq in enumerate(
            [
                torch.FloatTensor(sequence),
                torch.FloatTensor(np.ascontiguousarray(sequence[:, ::-1, ::-1])),
            ]
        ):
            for ii, model in enumerate(models):
                normmat = normmats[ii]
                normmat_nan = np.isnan(normmat)
                if np.any(normmat_nan):
                    normmat[normmat_nan] = np.nanmin(normmat[~normmat_nan])
                if targets and iii == 0:
                    target = targets[ii]

                (encoding32, encoding64, encoding128, encoding256) = model.net(
                    model.net1(
                        model.net0(
                            torch.Tensor(seq.float()).transpose(1, 2).cuda()
                            if use_cuda
                            else torch.Tensor(seq.float()).transpose(1, 2)
                        )
                    )[-1]
                )

                encodings = {
                    32: encoding32,
                    64: encoding64,
                    128: encoding128,
                    256: encoding256,
                }

                def eval_step(level, start, coarse_pred=None):
                    distenc = torch.log(
                        torch.FloatTensor(ns[level][None, :, :]).cuda()
                        if use_cuda
                        else torch.FloatTensor(ns[level][None, :, :])
                    ).expand(sequence.shape[0], 1, 250, 250)
                    if coarse_pred is not None:
                        pred = model.denets[level].forward(
                            encodings[level][
                                :, :, int(start / (level // 8)) : int(start / (level // 8)) + 250,
                            ],
                            distenc if iii == 0 else torch.flip(distenc, [2, 3]),
                            coarse_pred,
                        )
                    else:
                        pred = model.denets[level].forward(
                            encodings[level][
                                :, :, int(start / (level // 8)) : int(start / (level // 8)) + 250,
                            ],
                            distenc if iii == 0 else torch.flip(distenc, [2, 3]),
                        )

                    return pred

                preds = []
                starts = [0]
                ns = {}
                if targets and iii == 0:
                    ts = []
                if annotation is not None and iii == 0:
                    annos = []
                for j, level in enumerate([256, 128, 64, 32]):
                    normmat_r = np.nanmean(
                        np.nanmean(
                            np.reshape(
                                normmat[
                                    starts[j] : starts[j] + 250 * level // 8,
                                    starts[j] : starts[j] + 250 * level // 8,
                                ],
                                (1, 250, level // 8, 250, level // 8),
                            ),
                            axis=4,
                        ),
                        axis=2,
                    )
                    ns[level] = normmat_r

                    if j == 0:
                        pred = eval_step(level, starts[j])
                    else:
                        pred = eval_step(
                            level,
                            starts[j],
                            preds[j - 1][
                                :,
                                :,
                                start_index : start_index + 125,
                                start_index : start_index + 125,
                            ],
                        )

                    if targets and iii == 0:
                        target_r = np.nanmean(
                            np.nanmean(
                                np.reshape(
                                    target[
                                        :,
                                        starts[j] : starts[j] + 250 * level // 8,
                                        starts[j] : starts[j] + 250 * level // 8,
                                    ].numpy(),
                                    (target.shape[0], 250, level // 8, 250, level // 8),
                                ),
                                axis=4,
                            ),
                            axis=2,
                        )
                        target_nan = np.mean(
                            np.mean(
                                np.isnan(
                                    np.reshape(
                                        target[
                                            :,
                                            starts[j] : starts[j] + 250 * level // 8,
                                            starts[j] : starts[j] + 250 * level // 8,
                                        ].numpy(),
                                        (target.shape[0], 250, level // 8, 250, level // 8,),
                                    )
                                ),
                                axis=4,
                            ),
                            axis=2,
                        )
                        target_r[target_nan > nan_thresh] = np.nan
                        eps = np.nanmin(normmat_r)
                        target_np = np.log((target_r + eps) / (normmat_r + eps))[0, 0:, 0:]
                        ts.append(target_np)

                    if annotation is not None and iii == 0:
                        newstart = starts[j] / 8000.0
                        newend = (starts[j] + 250 * level // 8) / 8000.0
                        anno_r = []
                        for r in annotation:
                            if len(r) == 3:
                                if not (r[0] >= newend or r[1] <= newstart):
                                    anno_r.append(
                                        (
                                            np.fmax((r[0] - newstart) / (newend - newstart), 0,),
                                            np.fmin((r[1] - newstart) / (newend - newstart), 1,),
                                            r[2],
                                        )
                                    )
                            else:
                                if r[0] >= newstart and r[0] < newend:
                                    anno_r.append(((r[0] - newstart) / (newend - newstart), r[1]))
                        annos.append(anno_r)

                    if iii == 0:
                        proposed_start = (mpos - level * 1000000 / 4) - (
                            wpos - 128000000 + starts[j] * 4000 * 8
                        )
                    else:
                        proposed_start = (mpos - level * 1000000 / 4) - (
                            wpos + 128000000 - starts[j] * 4000 * 8 - level * 1000000
                        )
                    if chrlen is not None:
                        bounds = [
                            0 - (wpos - 128000000),
                            chrlen - level * 1000000 / 2 - (wpos - 128000000),
                        ]
                        if bounds[0] < bounds[1]:
                            proposed_start = np.clip(proposed_start, bounds[0], bounds[1])
                        else:
                            proposed_start = bounds[0]

                    start_index = int(np.clip(np.floor(proposed_start / (4000 * level)), 0, 125,))
                    if iii != 0:
                        start_index = 250 - (start_index + 125)

                    starts.append(starts[j] + start_index * level // 8)
                    preds.append(pred)

                allpreds.append(preds)
                allnormmats.append(ns)
                if iii == 0:
                    if targets:
                        alltargets.append(ts)
                    if annotation is not None:
                        allannos.append(annos)
                    allstarts.append(starts[:-1])

    output = {}

    output["predictions"] = [[] for _ in range(n_models)]
    for i in range(n_models):
        for j in range(len(allpreds[i])):
            if allpreds[i][j].shape[1] == 1:
                output["predictions"][i].append(
                    allpreds[i][j].cpu().detach().numpy()[0, 0, :, :] * 0.5
                    + allpreds[i + n_models][j].cpu().detach().numpy()[0, 0, ::-1, ::-1] * 0.5
                )
            else:
                output["predictions"][i].append(
                    allpreds[i][j].cpu().detach().numpy()[0, :, :, :] * 0.5
                    + allpreds[i + n_models][j].cpu().detach().numpy()[0, :, ::-1, ::-1] * 0.5
                )
    if targets:
        output["experiments"] = alltargets
    else:
        output["experiments"] = None
    output["start_coords"] = [wpos - 128000000 + s * 32000 for s in allstarts[0]]
    output["end_coords"] = [
        np.fmin(int(output["start_coords"][ii] + 256000000 / 2 ** (ii)), chrlen) for ii in range(4)
    ]

    if annotation is not None:
        output["annos"] = allannos[0]
    else:
        output["annos"] = None
    output["chr"] = mchr
    output["padding_chr"] = padding_chr
    output["normmats"] = allnormmats
    return output


def _retrieve_multi(regionlist, genome, target=True, normmat=True, normmat_regionlist=None):
    sequences = []
    for region in regionlist:
        if len(region) == 4:
            chrom, start, end, strand = region
            sequences.append(genome.get_encoding_from_coords(chrom, start, end, strand))
        else:
            chrom, start, end = region
            sequences.append(genome.get_encoding_from_coords(chrom, start, end, "+"))

    sequence = np.vstack(sequences)[None, :, :]

    
    if isinstance(target, list):
        target_objs = target
        has_target = True
    elif target and target_available:
        target_objs = [target_h1esc_256m, target_hff_256m]
        has_target = True
    else:
        has_target = False

    if has_target:
        targets = []
        for target_obj in target_objs:
            targets_ = []
            for region in regionlist:
                if len(region) == 4:
                    chrom, start, end, strand = region
                else:
                    chrom, start, end = region
                    strand = "+"
                t = []
                for region2 in regionlist:
                    if len(region2) == 4:
                        chrom2, start2, end2, strand2 = region2
                    else:
                        chrom2, start2, end2 = region2
                        strand = "+"
                    t.append(
                        target_obj.get_feature_data(
                            chrom, start, end, chrom2=chrom2, start2=start2, end2=end2
                        )
                    )
                    if strand == "-":
                        t[-1] = t[-1][::-1, :]
                    if strand2 == "-":
                        t[-1] = t[-1][:, ::-1]
                targets_.append(t)
            targets_= np.vstack([np.hstack(l) for l in targets_])
            targets.append(targets_)
        targets = [
            torch.FloatTensor(l[None, :, :]) for l in targets
        ]

    if normmat:
        if isinstance(normmat, list):
            normmat_objs = normmat
        else:
            normmat_objs = [h1esc_256m, hff_256m]
        
        if normmat_regionlist is None:
            normmat_regionlist = regionlist

        normmats = []
        for normmat_obj in normmat_objs:
            normmats_ = []
            for chrom, start, end, strand in normmat_regionlist:
                b = []
                for chrom2, start2, end2, strand2 in normmat_regionlist:
                    if chrom2 != chrom:
                        b.append(
                            np.full(
                                (int((end - start) / 32000), int((end2 - start2) / 32000)),
                                normmat_obj.background_trans,
                            )
                        )
                    else:
                        binsize = 32000
                        acoor = np.linspace(start, end, int((end - start) / 32000) + 1)[:-1]
                        bcoor = np.linspace(start2, end2, int((end2 - start2) / 32000) + 1)[:-1]
                        b.append(
                            normmat_obj.background_cis[
                                (np.abs(acoor[:, None] - bcoor[None, :]) / binsize).astype(int)
                            ]
                        )
                        if strand == "-":
                            b[-1] = b[-1][::-1, :]
                        if strand2 == "-":
                            b[-1] = b[-1][:, ::-1]
                normmats_.append(b)
            normmats_ = np.vstack([np.hstack(l) for l in normmats_])
            normmats.append(normmats_)

    datatuple = (sequence,)
    if normmat:
        datatuple = datatuple + (normmats,)
    if has_target:
        datatuple = datatuple + (targets,)
    return datatuple


[docs]def process_region(
    mchr,
    mstart,
    mend,
    genome,
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for 
    the specified region. 

    Parameters
    ----------
    mchr : str
        The chromosome name of the first segment
    mstart : int
        The start coordinate of the region.
    mend : ind
        The end coordinate of the region.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data 
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally 
        needed to fill the sequence to 256Mb. The padding sequence will be 
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_l, outputs_ref_r, outputs_alt : dict, dict, dict
        Reference allele predictions zooming into the left boundary of the
        duplication,
        Reference allele predictions zooming into the right boundary of the
        duplication,
        Alternative allele predictions zooming into the duplication breakpoint.
        The returned results are in the format of dictonaries 
        containing the prediction outputs and other 
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of `genomepredict` or `genomepredict_256Mb` for
        details of the dictionary content.

    """
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()
    mpos = int((int(mstart) + int(mend)) / 2)

    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models

    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False

    if window_radius == 16000000:
        wpos = coord_clip(mpos, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                )
                for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if has_target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )

    if mstart - mend < 2 * window_radius:
        anno_scaled = process_anno(
            [
                [
                    np.clip(mstart, wpos - window_radius, wpos + window_radius),
                    np.clip(mend, wpos - window_radius, wpos + window_radius),
                    "black",
                ]
            ],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    else:
        anno_scaled = None

    if window_radius == 128000000:
        outputs_ref = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mpos,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref = genomepredict(
            sequence, mchr, mpos, wpos, annotation=anno_scaled, models=models, targets=targets, use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref, show_coordinates=True, file=file + ".256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".pdf",
            )
    return outputs_ref


[docs]def process_dup(
    mchr,
    mstart,
    mend,
    genome,
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for 
    an duplication variant. 

    Parameters
    ----------
    mchr : str
        The chromosome name of the first segment
    mstart : int
        The start coordinate of the duplication.
    mend : ind
        The end coordinate of the duplication.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data 
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally 
        needed to fill the sequence to 256Mb. The padding sequence will be 
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_l, outputs_ref_r, outputs_alt : dict, dict, dict
        Reference allele predictions zooming into the left boundary of the
        duplication,
        Reference allele predictions zooming into the right boundary of the
        duplication,
        Alternative allele predictions zooming into the duplication breakpoint.
        The returned results are in the format of dictonaries 
        containing the prediction outputs and other 
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of `genomepredict` or `genomepredict_256Mb` for
        details of the dictionary content.
    """
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()

    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models

    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False

    # ref.l
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )

    if wpos + window_radius > mend:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius
        )
    else:
        anno_scaled = process_anno(
            [[mstart, wpos + window_radius, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 128000000:
        outputs_ref_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mstart,
            wpos,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            models=models,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_l = genomepredict(
            sequence,
            mchr,
            mstart,
            wpos,
            annotation=anno_scaled,
            models=models,
            targets=targets,
            use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_l, show_coordinates=True, file=file + ".ref.l.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_l,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.l.pdf",
            )

    # ref.r
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None

    if wpos - window_radius < mstart:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 16000000:
        outputs_ref_r = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot(
                outputs_ref_r,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.r.pdf",
            )
    else:
        outputs_ref_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mend,
            wpos,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            models=models,
            targets=targets,
            use_cuda=use_cuda,
        )
        genomeplot_256Mb(
            outputs_ref_r, show_coordinates=True, file=file + ".ref.r.256m.pdf",
        )

    # alt (r)
    s = StructuralChange2(mchr, chrlen)
    s.duplicate(mstart, mend)
    chrlen_alt = chrlen + mend - mstart
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen_alt)
        sequence = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        chrlen_alt_round = chrlen_alt - chrlen_alt % 32000
        if chrlen_alt_round < 256000000:
            wpos = 128000000
            (sequence, normmats) = _retrieve_multi(
                list(s[0:chrlen_alt_round]) + [[padding_chr, 0, 256000000 - chrlen_alt_round, "+"]],
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[
                    [mchr, 0, chrlen_alt_round, "+"],
                    [padding_chr, 0, 256000000 - chrlen_alt_round, "+"],
                ],
            )
        else:
            wpos = coord_clip(mend, chrlen_alt_round, window_radius=128000000)
            (sequence, normmats) = _retrieve_multi(
                list(s[wpos - window_radius : wpos + window_radius]),
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[[mchr, wpos - window_radius, wpos + window_radius, "+"]],
            )

    if wpos - window_radius < mstart and mend + mend - mstart < wpos + window_radius:
        anno_scaled = process_anno(
            [[mstart, mend, "black"], [mend, mend + mend - mstart, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    elif wpos - window_radius >= mstart and mend + mend - mstart < wpos + window_radius:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"], [mend, mend + mend - mstart, "gray"],],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    elif wpos - window_radius < mstart and mend + mend - mstart >= wpos + window_radius:
        anno_scaled = process_anno(
            [[mstart, mend, "black"], [mend, wpos + window_radius, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"], [mend, wpos + window_radius, "gray"],],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 16000000:
        outputs_alt = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt, show_coordinates=True, file=file + ".alt.pdf")
    else:
        outputs_alt = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_alt_round,
            mend,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt, show_coordinates=True, file=file + ".alt.256m.pdf",
            )

    return outputs_ref_l, outputs_ref_r, outputs_alt


[docs]def process_del(
    mchr,
    mstart,
    mend,
    genome,
    cmap=None,
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for 
    an deletion variant. 

    Parameters
    ----------
    mchr : str
        The chromosome name of the first segment
    mstart : int
        The start coordinate of the deletion.
    mend : ind
        The end coordinate of the deletion.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data 
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally 
        needed to fill the sequence to 256Mb. The padding sequence will be 
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_l, outputs_ref_r, outputs_alt : dict, dict, dict
        Reference allele predictions zooming into the left boundary of the
        deletion,
        Reference allele predictions zooming into the right boundary of the
        deletion,
        Alternative allele predictions zooming into the deletion breakpoint.
        The returned results are in the format of dictonaries 
        containing the prediction outputs and other 
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of `genomepredict` or `genomepredict_256Mb` for
        details of the dictionary content.
    """
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()

    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models

    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False

    # ref.l
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )

    if wpos + window_radius > mend:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius
        )
    else:
        anno_scaled = process_anno(
            [[mstart, wpos + window_radius, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 128000000:
        outputs_ref_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_l = genomepredict(
            sequence,
            mchr,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            targets=targets,
            use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_l, show_coordinates=True, file=file + ".ref.l.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_l,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                cmap=cmap,
                file=file + ".ref.l.pdf",
            )

    # ref.r
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None

    if wpos - window_radius < mstart:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 16000000:
        outputs_ref_r = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot(
                outputs_ref_r,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                cmap=cmap,
                file=file + ".ref.r.pdf",
            )
    else:
        outputs_ref_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mend,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_ref_r, show_coordinates=True, file=file + ".ref.r.256m.pdf",
            )

    # alt
    s = StructuralChange2(mchr, chrlen)
    s.delete(mstart, mend)
    chrlen_alt = chrlen - (mend - mstart)
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen_alt)
        sequence = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        chrlen_alt_round = chrlen_alt - chrlen_alt % 32000
        wpos = 128000000
        (sequence, normmats) = _retrieve_multi(
            list(s[0:chrlen_alt_round]) + [[padding_chr, 0, 256000000 - chrlen_alt_round, "+"]],
            genome,
            target=False,
            normmat=True,
            normmat_regionlist=[
                [mchr, 0, chrlen_alt_round, "+"],
                [padding_chr, 0, 256000000 - chrlen_alt_round, "+"],
            ],
        )

    anno_scaled = process_anno(
        [[mstart, "double"]], base=wpos - window_radius, window_radius=window_radius
    )

    if window_radius == 16000000:
        outputs_alt = genomepredict(
            sequence, mchr, mstart, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt, show_coordinates=True, cmap=cmap, file=file + ".alt.pdf")
    else:
        outputs_alt = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_alt_round,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt, show_coordinates=True, file=file + ".alt.256m.pdf",
            )

    return outputs_ref_l, outputs_ref_r, outputs_alt


[docs]def process_inv(
    mchr,
    mstart,
    mend,
    genome,
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for 
    an inversion variant. 

    Parameters
    ----------
    mchr : str
        The chromosome name of the first segment
    mstart : int
        The start coordinate of the inversion.
    mend : ind
        The end coordinate of the inversion.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data 
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally 
        needed to fill the sequence to 256Mb. The padding sequence will be 
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_l, outputs_ref_r, outputs_alt_l, outputs_alt_r : dict, dict, dict, dict
        Reference allele predictions zooming into the left boundary of the
        inversion,
        Reference allele predictions zooming into the right boundary of the
        inversion,
        Alternative allele predictions zooming into the left boundary of 
        the inversion,
        Alternative allele prediction zooming into the right boundary of 
        the inversion.
        The returned results are in the format of dictonaries 
        containing the prediction outputs and other 
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of `genomepredict` or `genomepredict_256Mb` for
        details of the dictionary content.
    """
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()

    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models

    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False

    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )

    if wpos + window_radius > mend:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[mstart, wpos + window_radius, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 128000000:
        outputs_ref_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_l = genomepredict(
            sequence,
            mchr,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            targets=targets,
            use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_l, show_coordinates=True, file=file + ".ref.l.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_l,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.l.pdf",
            )

    # ref.r
    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        mchr, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None

    if wpos - window_radius < mstart:
        anno_scaled = process_anno(
            [[mstart, mend, "black"]], base=wpos - window_radius, window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "black"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 16000000:
        outputs_ref_r = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot(
                outputs_ref_r,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.r.pdf",
            )
    else:
        outputs_ref_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mend,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_ref_r, show_coordinates=True, file=file + ".ref.r.256m.pdf",
            )

    # alt.l
    s = StructuralChange2(mchr, chrlen)
    s.invert(mstart, mend)
    if window_radius == 16000000:
        wpos = coord_clip(mstart, chrlen)
        sequence = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        wpos = 128000000
        (sequence,) = _retrieve_multi(
            list(s[0:chrlen_round]) + [[padding_chr, 0, 256000000 - chrlen_round, "+"]],
            genome,
            target=False,
            normmat=False,
        )

        # normmats are not changed for inversion

    if mend < wpos + window_radius:
        anno_scaled = process_anno(
            [[mstart, mend, "gray"]], base=wpos - window_radius, window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[mstart, wpos + window_radius, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 16000000:
        outputs_alt_l = genomepredict(
            sequence, mchr, mstart, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt_l, show_coordinates=True, file=file + ".alt.l.pdf")
    else:
        outputs_alt_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mstart,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt_l, show_coordinates=True, file=file + ".alt.l.256m.pdf",
            )

    if window_radius == 16000000:
        wpos = coord_clip(mend, chrlen)
        sequence = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)

    if mstart > wpos - window_radius:
        anno_scaled = process_anno(
            [[mstart, mend, "gray"]], base=wpos - window_radius, window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mend, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    if window_radius == 16000000:
        outputs_alt_r = genomepredict(
            sequence, mchr, mend, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt_r, show_coordinates=True, file=file + ".alt.r.pdf")
    else:
        outputs_alt_r = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mend,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt_r, show_coordinates=True, file=file + ".alt.r.256m.pdf",
            )

    return outputs_ref_l, outputs_ref_r, outputs_alt_l, outputs_alt_r


[docs]def process_ins(
    mchr,
    mpos,
    ins_seq,
    genome,
    strand="+",
    file=None,
    custom_models=None,
    target=True,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for 
    an insertion variant that inserts the specified sequence 
    to the insertion site. 

    Parameters
    ----------
    mchr : str
        The chromosome name of the first segment
    mpos : int
        The insertion site coordinate.
    ins_seq : str
        The inserted sequence in string format.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data 
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally 
        needed to fill the sequence to 256Mb. The padding sequence will be 
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref, outputs_alt_l, outputs_alt_r : dict, dict, dict
        Reference allele predictions zooming into the insertion site,
        Alternative allele predictions zooming into the left boundary of 
        the insertion seqeunce,
        Alternative allele prediction zooming into the right boundary of 
        the insertion seqeunce.
        The returned results are in the format of dictonaries 
        containing the prediction outputs and other 
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of `genomepredict` or `genomepredict_256Mb` for
        details of the dictionary content.
    """
    chrlen = [l for c, l in genome.get_chr_lens() if c == mchr].pop()

    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models

    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False

    if window_radius == 16000000:
        wpos = coord_clip(mpos, chrlen)
        sequence = genome.get_encoding_from_coords(
            mchr, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        "chr" + mchr.replace("chr", ""),
                        coord_round(wpos - window_radius),
                        coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen_round = chrlen - chrlen % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[mchr, 0, chrlen_round, "+"], [padding_chr, 0, 256000000 - chrlen_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )

    anno_scaled = process_anno(
        [[mpos, "single"]], base=wpos - window_radius, window_radius=window_radius
    )

    if window_radius == 128000000:
        outputs_ref_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_round,
            mpos,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref = genomepredict(
            sequence, mchr, mpos, wpos, annotation=anno_scaled, models=models, targets=targets, use_cuda=use_cuda,
        )

    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_l, show_coordinates=True, file=file + ".ref.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.pdf",
            )

    # alt
    s = StructuralChange2(mchr, chrlen)
    s.insert(mpos, len(ins_seq), strand=strand)
    chrlen_alt = chrlen + len(ins_seq)
    if window_radius == 16000000:
        wpos = coord_clip(mpos, chrlen_alt)
        sequence = []
        for chr_name, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            if chr_name.startswith("ins"):
                seq = Genome.sequence_to_encoding(ins_seq[start:end])
            else:
                seq = genome.get_encoding_from_coords(chr_name, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        chrlen_alt_round = chrlen_alt - chrlen_alt % 32000
        if chrlen_alt_round < 256000000:
            wpos = 128000000
            (sequence, normmats) = _retrieve_multi(
                list(s[0:chrlen_alt_round]) + [[padding_chr, 0, 256000000 - chrlen_alt_round, "+"]],
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[
                    [mchr, 0, chrlen_alt_round, "+"],
                    [padding_chr, 0, 256000000 - chrlen_alt_round, "+"],
                ],
            )
        else:
            wpos = coord_clip(mpos, chrlen_alt_round, window_radius=128000000)
            (sequence, normmats) = _retrieve_multi(
                list(s[wpos - window_radius : wpos + window_radius]),
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[[mchr, wpos - window_radius, wpos + window_radius, "+"]],
            )

    if mpos + len(ins_seq) < wpos + window_radius:
        anno_scaled = process_anno(
            [[mpos, mpos + len(ins_seq), "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[mpos, wpos + window_radius, "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 16000000:
        outputs_alt_l = genomepredict(
            sequence, mchr, mpos, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt_l, show_coordinates=True, file=file + ".alt.l.pdf")
    else:
        outputs_alt_l = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_alt_round,
            mpos,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt_l, show_coordinates=True, file=file + ".alt.l.256m.pdf",
            )

    if window_radius == 16000000:
        wpos = coord_clip(mpos + len(ins_seq), chrlen_alt)
        sequence = []
        for chr_name, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            if chr_name.startswith("ins"):
                seq = Genome.sequence_to_encoding(ins_seq[start:end])
            else:
                seq = genome.get_encoding_from_coords(chr_name, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
        sequence = np.concatenate(sequence, axis=1)
    else:
        if chrlen_alt_round > 256000000:
            wpos = coord_clip(mpos + len(ins_seq), chrlen_alt_round, window_radius=128000000)
            (sequence, normmats) = _retrieve_multi(
                list(s[wpos - window_radius : wpos + window_radius]),
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[[mchr, wpos - window_radius, wpos + window_radius, "+"]],
            )

    if mpos > wpos + window_radius:
        anno_scaled = process_anno(
            [[mpos, mpos + len(ins_seq), "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )
    else:
        anno_scaled = process_anno(
            [[wpos - window_radius, mpos + len(ins_seq), "gray"]],
            base=wpos - window_radius,
            window_radius=window_radius,
        )

    if window_radius == 16000000:
        outputs_alt_r = genomepredict(
            sequence, mchr, mpos + len(ins_seq), wpos, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt_r, show_coordinates=True, file=file + ".alt.r.pdf")
    else:
        outputs_alt = genomepredict_256Mb(
            sequence,
            mchr,
            normmats,
            chrlen_alt_round,
            mpos + len(ins_seq),
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt, show_coordinates=True, file=file + ".alt.r.256m.pdf",
            )

    return outputs_ref, outputs_alt_l, outputs_alt_r


[docs]def process_custom(
    region_list,
    ref_region_list,
    mpos,
    genome,
    ref_mpos_list=None,
    anno_list=None,
    ref_anno_list=None,
    custom_models=None,
    target=True,
    file=None,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for 
    a custom variant by an ordered list of genomic segments.

    Parameters
    ----------
    region_list : list(list(...))
        List of segments to complete the alternative. Each segment is specified
        by a list( chr: str, start: int, end: int, strand: str), and segments
        are concatenated together in the given order. The total length 
        should sum up to 32Mb. An example input is
        [['chr5', 89411065, 89411065+16000000, '-'], ['chr7', 94378248, 94378248+16000000,'+']].
    ref_region_list : list(list(...))
        The reference regions to predict. This can be any reference regions with
        the length of the specified window size. If the Each reference region is specified
        with a list( chr: str, start: int, end: int, strand: str). The strand must
        be '+'. The intended use is predicting the genome interactions for each 
        segment that constitute the alternative allele within the native
        reference sequence context. An example
        input is [['chr5', 89411065-16000000, 89411065+16000000,'+'],
        ['chr7', 94378248-16000000, 94378248+16000000,'+']].
    mpos : int
        The position to zoom into in the alternative allele. Note that `mpos`
        here specify the relative position with respect to the to start of the 32Mb.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from.
    ref_mpos_list : list(int) or None, optional
        Default is None. List of positions to zoom into for each of the 
        reference regions specified in `ref_region_list`. If not specified, 
        then zoom into the center of each region. Note that `ref_mpos_list`
        specifies the relative positions with respect to start of the 32Mb. 
        For example, `16000000` means the center of the sequence.
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data 
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. Currently only 16000000 (32Mb window) is accepted.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_l, outputs_ref_r, outputs_alt : dict, dict, dict
        Reference allele predictions zooming into the left boundary of the
        duplication,
        Reference allele predictions zooming into the right boundary of the
        duplication,
        Alternative allele predictions zooming into the duplication breakpoint.
        The returned results are in the format of dictonaries 
        containing the prediction outputs and other 
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of `genomepredict` or `genomepredict_256Mb` for
        details of the dictionary content.
        
    """
    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models

    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False

    def validate_region_list(region_list, enforce_strand=None):
        sumlen = 0
        for chrm, start, end, strand in region_list:
            chrlen = [l for c, l in genome.get_chr_lens() if c == chrm].pop()
            assert start >= 0 and end <= chrlen
            sumlen += end - start
            if enforce_strand:
                if strand != enforce_strand:
                    raise ValueError("The specified strand must be " + enforce_strand)
        assert sumlen == 2 * window_radius

    validate_region_list(region_list)

    for i, ref_region in enumerate(ref_region_list):
        validate_region_list([ref_region], enforce_strand="+")
        ref_sequence = genome.get_encoding_from_coords(*ref_region)[None, :]

        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        ref_region[0], coord_round(ref_region[1]), coord_round(ref_region[2]),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None

        anno_scaled = process_anno(ref_anno_list, base=0, window_radius=window_radius)

        outputs_ref = genomepredict(
            ref_sequence,
            ref_region[0],
            ref_region[1] + window_radius if ref_mpos_list is None else ref_mpos_list[i],
            ref_region[1] + window_radius,
            annotation=anno_scaled,
            models=models,
            targets=targets,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot(
                outputs_ref,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref." + str(i) + ".pdf",
            )

    sequence = []
    for chrm, start, end, strand in region_list:
        seq = genome.get_encoding_from_coords(chrm, start, end)
        if strand == "-":
            seq = seq[None, ::-1, ::-1].copy()
        else:
            seq = seq[None, :, :]
        sequence.append(seq)
    alt_sequence = np.concatenate(sequence, axis=1)

    anno_scaled = process_anno(anno_list, base=0, window_radius=window_radius)

    outputs_alt = genomepredict(
        alt_sequence, "chimeric", mpos, window_radius, models=models, annotation=anno_scaled, use_cuda=use_cuda,
    )
    if file is not None:
        genomeplot(outputs_alt, show_coordinates=False, file=file + ".alt.pdf")
    return outputs_ref, outputs_alt


[docs]def process_single_breakpoint(
    chr1,
    pos1,
    chr2,
    pos2,
    orientation1,
    orientation2,
    genome,
    custom_models=None,
    target=True,
    file=None,
    show_genes=True,
    show_tracks=False,
    window_radius=16000000,
    padding_chr="chr1",
    use_cuda=True,
):
    """
    Generate multiscale genome interaction predictions for
    a simple translocation event that connects 
    two chromosomal breakpoints. Specifically, two breakpoint 
    positions and the corresponding two orientations are needed. 
    The orientations decide how the breakpoints are connected. 
    The ‘+’ or ‘-’ sign indicate whether the left or right side of 
    the breakpoint is used. For example, for an input 
    ('chr1', 85691449, 'chr5', 89533745 '+', '+'), two plus signs
    indicate connecting chr1:0-85691449 with chr5:0-89533745.

    Parameters
    ----------
    chr1 : str
        The chromosome name of the first segment
    pos1 : int
        The coorindate of breakpoint on the first segment
    chr2 : str
        The chromosome name of the second segment
    pos2 : int
        The coorindate of breakpoint on the second segment
    orientation1 : str
        Indicate which side of the breakpoint should be used for
        the first segment,
        '+' indicate the left and '-' indicate the right side.
    orientation2 : str
        Indicate which side of the breakpoint should be used for
        the second segment,
        '+' indicate the left and '-' indicate the right side.
    genome : selene_utils2.MemmapGenome or selene_sdk.sequences.Genome
        The reference genome object to extract sequence from
    custom_models : list(torch.nn.Module or str) or None, optional
        Models to use instead of the default H1-ESC and HFF Orca models.
        Default is None.
    target : list(selene_utils2.Genomic2DFeatures or str) or bool, optional
        If specified as list, use this list of targets to retrieve experimental
        data (for plotting only). Default is True and will use micro-C data 
        for H1-ESC and HFF cells (4DNFI9GMP2J8, 4DNFI643OYP9) that correspond
        to the default models.
    file : str or None, optional
        Default is None. The output file prefix.
    show_genes : bool, optional
        Default is True. If True, generate gene annotation visualization
        file in pdf format that matches the windows of multiscale predictions.
    show_tracks : bool, optional
        Default is False. If True, generate chromatin tracks visualization
        file in pdf format that matches the windows of multiscale predictions.
    window_radius : int, optional
        Default is 16000000. The acceptable values are 16000000 which selects
        the 1-32Mb models or 128000000 which selects the 32-256Mb models.
    padding_chr : str, optional
        Default is "chr1". If window_radius is 128000000, padding is generally 
        needed to fill the sequence to 256Mb. The padding sequence will be 
        extracted from the padding_chr.
    use_cuda : bool, optional
        Default is True. Use CPU if False.

    Returns
    -------
    outputs_ref_1, outputs_ref_2, outputs_alt : dict, dict, dict
        Reference allele predictions zooming into the chr1 breakpoint,
        Reference allele predictions zooming into the chr2 breakpoint,
        Alternative allele prediction zooming into the junction.
        The returned results are in the format of dictonaries 
        containing the prediction outputs and other 
        retrieved information. These dictionaries can be directly used as
        input to genomeplot or genomeplot_256Mb. See documentation of `genomepredict` or `genomepredict_256Mb` for
        details of the dictionary content.
    """

    if custom_models is None:
        if window_radius == 16000000:
            models = ["h1esc", "hff"]
        elif window_radius == 128000000:
            models = ["h1esc_256m", "hff_256m"]
        else:
            raise ValueError(
                "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
            )
    else:
        models = custom_models

    if target:
        try:
            if target == True:
                if window_radius == 16000000:
                    target = ["h1esc", "hff"]
                elif window_radius == 128000000:
                    target = ["h1esc_256m", "hff_256m"]
            target = [t if isinstance(t, Genomic2DFeatures) else target_dict_global[t] for t in target]
        except KeyError:
            target = False

    chrlen1 = [l for c, l in genome.get_chr_lens() if c == chr1].pop()
    # ref.l
    if window_radius == 16000000:
        wpos = coord_clip(pos1, chrlen1)
        sequence = genome.get_encoding_from_coords(
            chr1, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        chr1, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen1_round = chrlen1 - chrlen1 % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[chr1, 0, chrlen1_round, "+"], [padding_chr, 0, 256000000 - chrlen1_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[chr1, 0, chrlen1_round, "+"], [padding_chr, 0, 256000000 - chrlen1_round, "+"]],
                genome,
                target=target,
            )
            targets = None
    else:
        raise ValueError(
            "Only window_radius 16000000 (32Mb models) or 128000000 (256Mb models) are supported"
        )

    anno_scaled = process_anno(
        [[pos1, "single"]], base=wpos - window_radius, window_radius=window_radius
    )

    if window_radius == 128000000:
        outputs_ref_1 = genomepredict_256Mb(
            sequence,
            chr1,
            normmats,
            chrlen1_round,
            pos1,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_1 = genomepredict(
            sequence, chr1, pos1, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )
    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_1, show_coordinates=True, file=file + ".ref.1.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_1,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.1.pdf",
                colorbar=True,
            )

    chrlen2 = [l for c, l in genome.get_chr_lens() if c == chr2].pop()
    if window_radius == 16000000:
        wpos = coord_clip(pos2, chrlen2)

        sequence = genome.get_encoding_from_coords(
            chr2, wpos - window_radius, wpos + window_radius
        )[None, :]
        if target:
            targets = [
                torch.FloatTensor(
                    t.get_feature_data(
                        chr2, coord_round(wpos - window_radius), coord_round(wpos + window_radius),
                    )[None, :]
                ) for t in target
            ]
        else:
            targets = None
    elif window_radius == 128000000:
        chrlen2_round = chrlen2 - chrlen2 % 32000
        wpos = 128000000
        if target:
            sequence, normmats, targets = _retrieve_multi(
                [[chr2, 0, chrlen2_round, "+"], [padding_chr, 0, 256000000 - chrlen2_round, "+"]],
                genome,
                target=target,
            )
        else:
            sequence, normmats = _retrieve_multi(
                [[chr2, 0, chrlen2_round, "+"], [padding_chr, 0, 256000000 - chrlen2_round, "+"]],
                genome,
                target=target,
            )
            targets = None

    anno_scaled = process_anno(
        [[pos2, "single"]], base=wpos - window_radius, window_radius=window_radius
    )

    if window_radius == 128000000:
        outputs_ref_2 = genomepredict_256Mb(
            sequence,
            chr2,
            normmats,
            chrlen2_round,
            pos2,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            targets=targets,
            use_cuda=use_cuda,
        )
    else:
        outputs_ref_2 = genomepredict(
            sequence, chr2, pos2, wpos, models=models, annotation=anno_scaled, targets=targets, use_cuda=use_cuda,
        )

    if file is not None:
        if window_radius == 128000000:
            genomeplot_256Mb(
                outputs_ref_2, show_coordinates=True, file=file + ".ref.2.256m.pdf",
            )
        else:
            genomeplot(
                outputs_ref_2,
                show_genes=show_genes,
                show_tracks=show_tracks,
                show_coordinates=True,
                file=file + ".ref.2.pdf",
                colorbar=True,
            )

    chrlen = [l for c, l in genome.get_chr_lens() if c == chr1].pop()
    s = StructuralChange2(chr1, chrlen)
    if orientation1 == "+":
        s.delete(pos1, chrlen)
    else:
        s.delete(0, pos1 - 1)
        s.invert(0, chrlen - pos1 + 1)

    chrlen = [l for c, l in genome.get_chr_lens() if c == chr2].pop()
    s2 = StructuralChange2(chr2, chrlen)
    if orientation2 == "-":
        s2.delete(0, pos2 - 1)
    else:
        s2.delete(pos2, chrlen)
        s2.invert(0, pos2)

    breakpos = s.coord_points[-1]
    s = s + s2

    if window_radius == 16000000:
        wpos = coord_clip(breakpos, s.coord_points[-1])

        sequence = []
        curpos = 0
        anno = []
        for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
            seq = genome.get_encoding_from_coords(chrm, start, end)
            if strand == "-":
                seq = seq[None, ::-1, ::-1]
            else:
                seq = seq[None, :, :]
            sequence.append(seq)
            anno.append([curpos, curpos + end - start])
            curpos = curpos + end - start
        sequence = np.concatenate(sequence, axis=1)
    else:
        chrlen_alt_round = s.coord_points[-1] - s.coord_points[-1] % 32000
        if chrlen_alt_round < 256000000:
            wpos = 128000000
            (sequence, normmats) = _retrieve_multi(
                list(s[0:chrlen_alt_round]) + [[padding_chr, 0, 256000000 - chrlen_alt_round, "+"]],
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[
                    [chr1 + "|" + chr2, 0, chrlen_alt_round, "+"],
                    [padding_chr, 0, 256000000 - chrlen_alt_round, "+"],
                ],
            )
            curpos = 0
            anno = []
            for chrm, start, end, strand in s[0:chrlen_alt_round]:
                anno.append([curpos, curpos + end - start])
                curpos = curpos + end - start
        else:
            wpos = coord_clip(breakpos, chrlen_alt_round, window_radius=128000000)
            (sequence, normmats) = _retrieve_multi(
                list(s[wpos - window_radius : wpos + window_radius]),
                genome,
                target=False,
                normmat=True,
                normmat_regionlist=[
                    [chr1 + "|" + chr2, wpos - window_radius, wpos + window_radius, "+"]
                ],
            )
            curpos = 0
            anno = []
            for chrm, start, end, strand in s[wpos - window_radius : wpos + window_radius]:
                anno.append([curpos, curpos + end - start])
                curpos = curpos + end - start

    anno_scaled = process_anno([[anno[0][-1], "double"]], base=0, window_radius=window_radius)
    if window_radius == 16000000:
        outputs_alt = genomepredict(
            sequence, chr1 + "|" + chr2, breakpos, wpos, models=models, annotation=anno_scaled, use_cuda=use_cuda
        )
        if file is not None:
            genomeplot(outputs_alt, show_coordinates=False, file=file + ".alt.pdf", colorbar=True)
    else:
        outputs_alt = genomepredict_256Mb(
            sequence,
            chr1 + "|" + chr2,
            normmats,
            chrlen_alt_round,
            breakpos,
            wpos,
            models=models,
            annotation=anno_scaled,
            padding_chr=padding_chr,
            use_cuda=use_cuda,
        )
        if file is not None:
            genomeplot_256Mb(
                outputs_alt, show_coordinates=True, file=file + ".alt.256m.pdf",
            )
    return outputs_ref_1, outputs_ref_2, outputs_alt


if __name__ == "__main__":
    from docopt import docopt
    import sys
    import os

    doc = """
    Orca multiscale genome interaction sequence model prediction tool.

    Usage:
    orca_predict region [options] <coordinate> <output_dir>
    orca_predict del [options] <coordinate> <output_dir>
    orca_predict dup [options] <coordinate> <output_dir>
    orca_predict inv [options] <coordinate> <output_dir>
    orca_predict break [options] <coordinate> <output_dir>

    Options:
    -h --help       Show this screen.
    --show_genes    Show gene annotation (only supported for 32Mb models).
    --show_tracks   Show chromatin tracks (only supported for 32Mb models).
    --256m          Use 256Mb models (default is 32Mb).
    --nocuda        Use CPU implementation. 
    --version       Show version.
    """
    if len(sys.argv) == 1:
        sys.argv.append("-h")
    arguments = docopt(doc, version="Orca v0.1")
    show_genes = arguments["--show_genes"]
    show_tracks = arguments["--show_tracks"]
    window_radius = 128000000 if arguments["--256m"] else 16000000
    use_cuda = not arguments["--nocuda"]

    load_resources(models=["32M"], use_cuda=use_cuda)

    if arguments["region"]:
        predtype = "region"
    elif arguments["del"]:
        predtype = "del"
    elif arguments["dup"]:
        predtype = "dup"
    elif arguments["inv"]:
        predtype = "inv"
    elif arguments["break"]:
        predtype = "break"

    def predict(chrm, start, end, savedir):

        if not os.path.exists(savedir):
            os.makedirs(savedir)

        with torch.no_grad():
            outputs = process_region(
                chrm,
                start,
                end,
                hg38,
                target=target_available,
                file=savedir + "/orca_pred",
                show_genes=show_genes,
                show_tracks=show_tracks,
                window_radius=window_radius,
                padding_chr="chr1",
                use_cuda=use_cuda,
            )
        torch.save(outputs, savedir + "/orca_pred.pth")
        return None

    def get_interactions(predtype, content, savedir):

        if predtype == "region":
            pdf_names = ["orca_pred.pdf"]
            if show_genes or show_tracks:
                pdf_names += ["orca_pred.anno.pdf"]
            chrstr, coordstr = str(content).split(":")
            chrstr = "chr" + chrstr.replace("chr", "")
            coord_s, coord_e = coordstr.split("-")
            predict(chrstr, int(coord_s), int(coord_e), savedir)
        elif predtype in ["dup", "del"]:
            pdf_names = ["orca_pred.ref.l.pdf", "orca_pred.ref.r.pdf", "orca_pred.alt.pdf"]
            if show_genes or show_tracks:
                pdf_names += [
                    "orca_pred.ref.l.anno.pdf",
                    "orca_pred.ref.r.anno.pdf",
                    "orca_pred.alt.anno.pdf",
                ]
            chrstr, coordstr = str(content).split(":")
            chrstr = "chr" + chrstr.replace("chr", "")
            coord_s, coord_e = coordstr.split("-")

            if not os.path.exists(savedir):
                os.makedirs(savedir)

            if predtype == "dup":
                outputs_ref_l, outputs_ref_r, outputs_alt = process_dup(
                    chrstr,
                    int(coord_s),
                    int(coord_e),
                    hg38,
                    target=target_available,
                    show_genes=show_genes,
                    show_tracks=show_tracks,
                    file=savedir + "/orca_pred",
                    window_radius=window_radius,
                    use_cuda=use_cuda,
                )
            else:
                outputs_ref_l, outputs_ref_r, outputs_alt = process_del(
                    chrstr,
                    int(coord_s),
                    int(coord_e),
                    hg38,
                    target=target_available,
                    show_genes=show_genes,
                    show_tracks=show_tracks,
                    file=savedir + "/orca_pred",
                    window_radius=window_radius,
                    use_cuda=use_cuda,
                )
            torch.save(
                {
                    "outputs_ref_l": outputs_ref_l,
                    "outputs_ref_r": outputs_ref_r,
                    "outputs_alt": outputs_alt,
                },
                savedir + "/orca_pred.pth",
            )
        elif predtype == "inv":
            pdf_names = [
                "orca_pred.ref.l.pdf",
                "orca_pred.ref.r.pdf",
                "orca_pred.alt.l.pdf",
                "orca_pred.alt.r.pdf",
            ]
            if show_genes or show_tracks:
                pdf_names += [
                    "orca_pred.ref.l.anno.pdf",
                    "orca_pred.ref.r.anno.pdf",
                    "orca_pred.alt.l.anno.pdf",
                    "orca_pred.alt.r.anno.pdf",
                ]
            chrstr, coordstr = str(content).split(":")
            chrstr = "chr" + chrstr.replace("chr", "")
            coord_s, coord_e = coordstr.split("-")

            if not os.path.exists(savedir):
                os.makedirs(savedir)

            outputs_ref_l, outputs_ref_r, outputs_alt_l, outputs_alt_r = process_inv(
                chrstr,
                int(coord_s),
                int(coord_e),
                hg38,
                target=target_available,
                show_genes=show_genes,
                show_tracks=show_tracks,
                file=savedir + "/orca_pred",
                window_radius=window_radius,
                use_cuda=use_cuda,
            )

            torch.save(
                {
                    "outputs_ref_l": outputs_ref_l,
                    "outputs_ref_r": outputs_ref_r,
                    "outputs_alt_l": outputs_alt_l,
                    "outputs_alt_r": outputs_alt_r,
                },
                savedir + "/orca_pred.pth",
            )
        elif predtype == "break":
            pdf_names = ["orca_pred.ref.1.pdf", "orca_pred.ref.2.pdf", "orca_pred.alt.pdf"]
            if show_genes or show_tracks:
                pdf_names += [
                    "orca_pred.ref.1.anno.pdf",
                    "orca_pred.ref.2.anno.pdf",
                    "orca_pred.alt.anno.pdf",
                ]
            chr_coord_1, chr_coord_2, orientations = str(content.replace("\t", " ")).split(" ")
            chr1, coord1 = chr_coord_1.split(":")
            chr2, coord2 = chr_coord_2.split(":")
            chr1 = "chr" + chr1.replace("chr", "")
            chr2 = "chr" + chr2.replace("chr", "")
            orientation1, orientation2 = orientations.split("/")

            if not os.path.exists(savedir):
                os.makedirs(savedir)

            outputs_ref_1, outputs_ref_2, outputs_alt = process_single_breakpoint(
                chr1,
                int(coord1),
                chr2,
                int(coord2),
                orientation1,
                orientation2,
                hg38,
                target=target_available,
                show_genes=show_genes,
                show_tracks=show_tracks,
                file=savedir + "/orca_pred",
                window_radius=window_radius,
                use_cuda=use_cuda,
            )

            torch.save(
                {
                    "outputs_ref_1": outputs_ref_1,
                    "outputs_ref_2": outputs_ref_2,
                    "outputs_alt": outputs_alt,
                },
                savedir + "/orca_pred.pth",
            )
        else:
            raise ValueError("Unexpected prediction type!")
        return None

    get_interactions(predtype, arguments["<coordinate>"], arguments["<output_dir>"])