```{python}
# Install Required Python Packages (First Run Only)
first_time = False  # Set to False after first run

if first_time:
    %pip install tensorflow
    ## this requires numpy<2.2.0,>=1.26.0 (from tensorflow)
    ## it will uninstall numpy not compatible with tensorflow
    import platform
    if platform.processor() == 'arm':
        print("Apple Silicon detected, installing tensorflow-metal...")
        %pip install tensorflow-metal
    else:
        print("Not running on Apple Silicon, skipping tensorflow-metal installation")
    %pip install tensorflow_hub
    # %pip install joblib # already installed
    %pip install kipoiseq
    # %pip install pyfaidx # already installed
    # %pip install pandas # already installed
    # %pip install numpy # already installed
    # %pip install matplotlib # already installed
    # %pip install seaborn # already installed
    %pip install cyvcf2
    %pip install Bio
```
# Enformer usage neanderthal - jupyter notebook version

Modified from the Enformer usage notebook at https://github.com/google-deepmind/deepmind-research/blob/master/enformer/enformer-usage.ipynb

## Highly recommended: create a new conda environment

Installing tensorflow and the other packages needed for this notebook will downgrade numpy (and probably other packages) and break other notebooks that depend on the current versions. I strongly recommend cloning the gene46100 conda environment and installing the python packages into the clone as needed:

```{bash}
conda create --name newenv46100 --clone oldenv46100
```
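If you work in Jupyter, you will also want to activate the clone and register it as a kernel. A minimal sketch, assuming the clone is named `newenv46100` as above and has `ipykernel` installed:

```{bash}
conda activate newenv46100                             # activate the cloned environment
python -m ipykernel install --user --name newenv46100  # expose it as a Jupyter kernel
```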
## Steps

This notebook demonstrates how to

- Make predictions with Enformer using the human reference genome
- Make predictions using the Neanderthal vcf

## 1. Set up and function definitions

This section is based on Sabrina's EnformerVCF.py file, which contains the functions necessary to run Enformer on vcf-modified sequences; it builds on functions from Temi and Sai, in turn based on Avsec et al.'s code.

### Install Required Python Packages (First Run Only)

See the first code cell at the top of this notebook.
### Import Libraries and Define Utility Functions

```{python}
# Import Libraries and Define Utility Functions
import tensorflow as tf
# Make sure the GPU is enabled
assert tf.config.list_physical_devices('GPU'), 'Start the colab kernel with GPU: Runtime -> Change runtime type -> GPU'
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

import tensorflow_hub as hub    # for interacting with saved models and tensorflow hub
import joblib
import gzip                     # for manipulating compressed files
import kipoiseq                 # for manipulating fasta files
from kipoiseq import Interval   # same as above, really
import pyfaidx                  # to index our reference genome file
import pandas as pd             # for manipulating dataframes
import numpy as np              # for numerical computations
import matplotlib.pyplot as plt # for plotting
import matplotlib as mpl        # for plotting
import seaborn as sns           # for plotting
import pickle                   # for saving large objects
import os, sys                  # functions for interacting with the operating system
import io                       # for in-memory file handling (used by read_vcf)
import cyvcf2                   # for reading VCF files
import Bio
from Bio.Seq import Seq

def create_rev_complement(dna_string):
    return(str(Seq(dna_string).reverse_complement()))
```
Num GPUs Available: 1
### Define Enformer Model Classes and Sequence Extraction

```{python}
# Define Enformer Model Classes and Sequence Extraction
SEQUENCE_LENGTH = 393216

class Enformer:

    def __init__(self, tfhub_url):
        self._model = hub.load(tfhub_url).model

    def predict_on_batch(self, inputs):
        predictions = self._model.predict_on_batch(inputs)
        return {k: v.numpy() for k, v in predictions.items()}

    @tf.function
    def contribution_input_grad(self, input_sequence,
                                target_mask, output_head='human'):
        input_sequence = input_sequence[tf.newaxis]

        target_mask_mass = tf.reduce_sum(target_mask)
        with tf.GradientTape() as tape:
            tape.watch(input_sequence)
            prediction = tf.reduce_sum(
                target_mask[tf.newaxis] *
                self._model.predict_on_batch(input_sequence)[output_head]) / target_mask_mass

        input_grad = tape.gradient(prediction, input_sequence) * input_sequence
        input_grad = tf.squeeze(input_grad, axis=0)
        return tf.reduce_sum(input_grad, axis=-1)


class EnformerScoreVariantsRaw:

    def __init__(self, tfhub_url, organism='human'):
        self._model = Enformer(tfhub_url)
        self._organism = organism

    def predict_on_batch(self, inputs):
        ref_prediction = self._model.predict_on_batch(inputs['ref'])[self._organism]
        alt_prediction = self._model.predict_on_batch(inputs['alt'])[self._organism]

        return alt_prediction.mean(axis=1) - ref_prediction.mean(axis=1)


class EnformerScoreVariantsNormalized:

    def __init__(self, tfhub_url, transform_pkl_path,
                 organism='human'):
        assert organism == 'human', 'Transforms only compatible with organism=human'
        self._model = EnformerScoreVariantsRaw(tfhub_url, organism)
        with tf.io.gfile.GFile(transform_pkl_path, 'rb') as f:
            transform_pipeline = joblib.load(f)
        self._transform = transform_pipeline.steps[0][1]  # StandardScaler.

    def predict_on_batch(self, inputs):
        scores = self._model.predict_on_batch(inputs)
        return self._transform.transform(scores)


class EnformerScoreVariantsPCANormalized:

    def __init__(self, tfhub_url, transform_pkl_path,
                 organism='human', num_top_features=500):
        self._model = EnformerScoreVariantsRaw(tfhub_url, organism)
        with tf.io.gfile.GFile(transform_pkl_path, 'rb') as f:
            self._transform = joblib.load(f)
        self._num_top_features = num_top_features

    def predict_on_batch(self, inputs):
        scores = self._model.predict_on_batch(inputs)
        return self._transform.transform(scores)[:, :self._num_top_features]


# @title `variant_centered_sequences`

class FastaStringExtractor:

    def __init__(self, fasta_file):
        self.fasta = pyfaidx.Fasta(fasta_file)
        self._chromosome_sizes = {k: len(v) for k, v in self.fasta.items()}

    def extract(self, interval: Interval, **kwargs) -> str:
        # Truncate interval if it extends beyond the chromosome lengths.
        chromosome_length = self._chromosome_sizes[interval.chrom]
        trimmed_interval = Interval(interval.chrom,
                                    max(interval.start, 0),
                                    min(interval.end, chromosome_length),
                                    )
        # pyfaidx wants a 1-based interval
        sequence = str(self.fasta.get_seq(trimmed_interval.chrom,
                                          trimmed_interval.start + 1,
                                          trimmed_interval.stop).seq).upper()
        # Fill truncated values with N's.
        pad_upstream = 'N' * max(-interval.start, 0)
        pad_downstream = 'N' * max(interval.end - chromosome_length, 0)
        return pad_upstream + sequence + pad_downstream

    def close(self):
        return self.fasta.close()
```
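To make the expected shapes concrete before moving on, here is a minimal usage sketch of `FastaStringExtractor` and `Enformer` together; it assumes `model_path` and `fasta_file` are set as in section 2 below, and the example interval is taken from the original Enformer usage notebook. The published model returns 896 bins x 5313 human tracks per sequence:

```{python}
# minimal usage sketch (assumes model_path and fasta_file from section 2 below)
extractor = FastaStringExtractor(fasta_file)
interval = kipoiseq.Interval('chr11', 35082742, 35197430)   # example region
seq = extractor.extract(interval.resize(SEQUENCE_LENGTH))   # string of length 393216
onehot = kipoiseq.transforms.functional.one_hot_dna(seq).astype(np.float32)[np.newaxis]
enformer = Enformer(model_path)
preds = enformer.predict_on_batch(onehot)['human'][0]       # shape (896, 5313)
```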
### Data Processing and Visualization Functions

```{python}
# Data Processing and Visualization Functions
def one_hot_encode(sequence):
    return kipoiseq.transforms.functional.one_hot_dna(sequence).astype(np.float32)

# @title `plot_tracks`
def plot_tracks(tracks, interval, height=1.5):
    fig, axes = plt.subplots(len(tracks), 1, figsize=(20, height * len(tracks)), sharex=True)
    for ax, (title, y) in zip(axes, tracks.items()):
        ax.fill_between(np.linspace(interval.start, interval.end, num=len(y)), y)
        ax.set_title(title)
        sns.despine(top=True, right=True, bottom=True)
    ax.set_xlabel(str(interval))
    plt.tight_layout()

def read_vcf(path):
    with gzip.open(path, 'rt') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})
```
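A quick sanity check of the encoding convention (to my understanding, kipoiseq's `one_hot_dna` maps A/C/G/T to unit rows and any other character, such as N, to an all-zero row):

```{python}
# sanity-check sketch of the one-hot encoding convention
x = one_hot_encode("ACGTN")
print(x.shape)  # (5, 4)
print(x)        # A/C/G/T rows are unit vectors; the N row should be all zeros
```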
```{python}
# def vcf_to_seq(target_interval, individual, vcf, fasta_extractor):
#     ## superseded by vcf_to_seq_faster below
#     target_fa = fasta_extractor.extract(target_interval.resize(SEQUENCE_LENGTH))
#     window_coords = target_interval.resize(SEQUENCE_LENGTH)
#     # two haplotypes per individual
#     haplo_1 = list(target_fa[:])
#     haplo_2 = list(target_fa[:])
#     ref_mismatch_count = 0
#     for i, row in vcf.iterrows():
#         geno = row[individual].split("|")
#         if (row["POS"] - window_coords.start - 1) >= len(haplo_2):
#             continue
#         if (row["POS"] - window_coords.start - 1) < 0:
#             continue
#         if geno[0] == "1":
#             haplo_1[row["POS"] - window_coords.start - 1] = row["ALT"]
#         if geno[1] == "1":
#             haplo_2[row["POS"] - window_coords.start - 1] = row["ALT"]
#     return haplo_1, haplo_2

def vcf_to_seq_faster(target_interval, individual, vcf_file, fasta_extractor):
    # target_interval is a kipoiseq Interval, e.g. kipoiseq.Interval("chr22", 18118779, 18145669)
    # individual is the id of the individual in the vcf file
    target_fa = fasta_extractor.extract(target_interval.resize(SEQUENCE_LENGTH))
    window_coords = target_interval.resize(SEQUENCE_LENGTH)
    # two haplotypes per individual
    haplo_1 = list(target_fa[:])
    haplo_2 = list(target_fa[:])

    # Open the VCF file
    vcf_reader = cyvcf2.VCF(vcf_file)

    # Specific genomic region
    CHROM = window_coords.chrom
    START = max(1, window_coords.start)
    END = min(window_coords.end, fasta_extractor._chromosome_sizes[CHROM] - 1)

    count1 = 0
    count2 = 0

    # Iterate through variants in the specified region
    for variant in vcf_reader(CHROM + ':' + str(START) + '-' + str(END)):
        # Access the individual's genotype using the (0-based) index of the sample
        individual_index = vcf_reader.samples.index(individual)
        genotype = variant.genotypes[individual_index]
        ALT = variant.ALT[0]
        REF = variant.REF
        POS = variant.POS
        if REF == target_fa[POS - window_coords.start - 1]:
            if genotype[0] == 1:
                haplo_1[POS - window_coords.start - 1] = ALT
                count1 = count1 + 1
            if genotype[1] == 1:
                haplo_2[POS - window_coords.start - 1] = ALT
                count2 = count2 + 1
        else:
            print("ERROR: REF in vcf " + REF + " != REF from the build " + target_fa[POS - window_coords.start - 1])
    print("number of changes haplo1:")
    print(count1)
    print("number of changes haplo2:")
    print(count2)
    return haplo_1, haplo_2
```
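One small optimization worth noting: `vcf_reader.samples.index(individual)` is recomputed for every variant even though it never changes. A sketch of the hoisted lookup (same behavior; the region string is just an example, and `vcf_file` is assumed to be the chr5 file defined below):

```{python}
# sketch: hoist the constant sample-index lookup out of the variant loop
vcf_reader = cyvcf2.VCF(vcf_file)                        # assumes vcf_file as defined below
individual_index = vcf_reader.samples.index('AltaiNea')  # constant for the whole file
for variant in vcf_reader('chr5:96875939-96919716'):     # example region
    genotype = variant.genotypes[individual_index]
```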
### Define comparison functions

```{python}
# Comparison and Summary Statistics Functions
def get_diffmat(mat1, mat2):
    # difference scaled by the column-wise maximum across both matrices
    diffmat = mat1 - mat2
    abs_diffmat = np.abs(diffmat)  # (unused)

    colwise_maxes1 = np.max(mat1, axis=0)
    colwise_maxes2 = np.max(mat2, axis=0)

    colwise_maxes_maxes = np.maximum(colwise_maxes1, colwise_maxes2)

    relmax3_diffmat = diffmat / colwise_maxes_maxes
    relmax3_diffmat = np.abs(relmax3_diffmat)

    return relmax3_diffmat

def get_summary(arr):
    summary = {
        "mean": np.mean(arr),
        "median": np.median(arr),
        "minimum": np.min(arr),
        "maximum": np.max(arr),
        "q1": np.percentile(arr, 25),
        "q3": np.percentile(arr, 75),
    }
    return summary

def plot_hist(arr, bin_num, xlab='Value', ylab='Frequency', title='Histogram'):
    plt.hist(arr, bins=bin_num)
    plt.title(title)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.show()

def column_correlations(mat1, mat2):
    if mat1.shape != mat2.shape:
        raise ValueError("Input matrices must have the same shape")
    num_columns = mat1.shape[1]
    correlations = np.empty(num_columns)

    for col in range(num_columns):
        correlations[col] = np.corrcoef(mat1[:, col], mat2[:, col])[0, 1]

    return correlations
```
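A tiny toy check of these helpers on synthetic data (not Enformer output); a linear transform of a matrix should give column-wise correlations of 1:

```{python}
# toy check of column_correlations / get_summary on synthetic matrices
rng = np.random.default_rng(0)
m1 = rng.normal(size=(4, 3))
m2 = 2 * m1 + 1                      # linear transform, so correlations should be ~1
print(column_correlations(m1, m2))   # ~[1. 1. 1.]
print(get_summary(get_diffmat(m1, m2)))
```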
## 2. Set File Paths and Load Enformer Model

If this is your first run, download the model from https://uchicago.box.com/s/ppzn9lqqsnr3i9jqcgc52zf668sllkjx and the hg19 fasta file from https://uchicago.box.com/s/0rh4q4syucn66ne1d8n2aw9g3yyst9a0

If needed, you can also download the hg38 fasta file from https://uchicago.box.com/s/wl50ji7jms2c8alyqxyk4q6uru37nnt9 (but I believe the neanderthal vcf is based on hg19)

```{python}
# Set File Paths and Load Enformer Model
## edit this path to the location of the files on your computer
PRE = "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/Reference-Data/"

model_path = PRE + "models/enformer/raw"
fasta_file = PRE + "ref_sequences/hg19/raw/genome.fa"
## check whether the specific reference fasta used for calling the neanderthal vcf should be used instead
# fasta_file = PRE + "ref_sequences/hg38/Homo_sapiens_assembly38.fasta"

model = Enformer(model_path)  # here we load the model architecture.
fasta_extractor = FastaStringExtractor(fasta_file)
```
2025-05-01 18:57:44.433668: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2025-05-01 18:57:44.433703: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2025-05-01 18:57:44.433706: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1746143864.434317 634286 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1746143864.434366 634286 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
### Load Target Annotation Data

```{python}
## Load Target Annotation Data
targets_txt = 'https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt'
# df_targets = pd.read_csv(targets_txt, sep='\t')
targets_slim_file = PRE + "models/enformer/targets_slims.csv"
targets_slim_df = pd.read_csv(targets_slim_file)
```
## 3. Run Enformer on Neanderthal genomes

### Shell Script: Preprocess Neanderthal VCF Files

Download the Altai chr5 filtered vcf, then compress and index it:

```{bash}
brew install htslib   # provides bgzip and tabix (macOS)
bgzip AltaiNea.hg19_1000g.5.vcf
tabix -p vcf AltaiNea.hg19_1000g.5.vcf.gz
```

Create a file `filter-add-chr.sh` with the following content, and run `chmod u+x filter-add-chr.sh` to make it executable:
```{bash}
#!/bin/bash
for NUM in {1..22}; do
  # Filter missing genotypes and non-variant sites
  bcftools view -e '(GT="./.") || (GT="0/0") || (ALT=".")' AltaiNea.hg19_1000g.${NUM}.vcf.gz > AltaiNea.hg19_1000g.${NUM}.nomiss.vcf
  # Compress the resulting VCF
  bgzip AltaiNea.hg19_1000g.${NUM}.nomiss.vcf
  # Add "chr" prefix to all non-header lines and compress
  # zcat < ... is needed on a mac terminal; on linux, drop the <, i.e., zcat AltaiNea...
  zcat < AltaiNea.hg19_1000g.${NUM}.nomiss.vcf.gz | awk 'BEGIN{OFS=FS="\t"} /^#/ {print; next} {print "chr"$0}' | bgzip > AltaiNea.hg19_1000g.chr${NUM}.nomiss.vcf.gz
  # Filter to retain only SNPs
  bcftools view -i 'strlen(REF) = 1 && strlen(ALT) = 1' AltaiNea.hg19_1000g.chr${NUM}.nomiss.vcf.gz > AltaiNea.hg19_1000g.chr${NUM}.nomiss.snps_only.vcf
done
```
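Note that the last step of the script writes an uncompressed `.snps_only.vcf`, while the Python code below opens `.snps_only.vcf.gz` and queries it by region through cyvcf2, which requires bgzip compression plus a tabix index. A likely follow-up, inferred from the file names rather than taken from the original script:

```{bash}
./filter-add-chr.sh
for NUM in {1..22}; do
  bgzip AltaiNea.hg19_1000g.chr${NUM}.nomiss.snps_only.vcf
  tabix -p vcf AltaiNea.hg19_1000g.chr${NUM}.nomiss.snps_only.vcf.gz
done
```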
### Load Neanderthal vcf and predict epigenome

```{python}
# Load Neanderthal vcf and predict epigenome
# download the vcf file from here https://uchicago.box.com/s/f682q1c6tl3cdnqwbvga0z72u5e203zs
# and put it in your data folder

# read VCFs and encode haplotypes
CHROM = 'chr5'
vcf_file = PRE + "neanderthal/AltaiNea.hg19_1000g." + CHROM + ".nomiss.snps_only.vcf.gz"

target_interval = kipoiseq.Interval(CHROM, 96875939, 96919716)
haplo1, haplo2 = vcf_to_seq_faster(target_interval, 'AltaiNea', vcf_file, fasta_extractor)
haplo0 = fasta_extractor.extract(target_interval.resize(SEQUENCE_LENGTH))

haplo1_enc = one_hot_encode("".join(haplo1))[np.newaxis]
haplo2_enc = one_hot_encode("".join(haplo2))[np.newaxis]
haplo0_enc = one_hot_encode("".join(haplo0))[np.newaxis]

print("number of changes"); print(np.sum(haplo2_enc != haplo0_enc))

pred_human = model.predict_on_batch(haplo0_enc)['human'][0]
pred_altai = model.predict_on_batch((haplo1_enc + haplo2_enc) / 2)['human'][0]
```
[W::bcf_hrec_check] Invalid tag name: "1000gALT"
[W::vcf_parse_filter] FILTER 'LowQual' is not defined in the header
number of changes haplo1:
538
number of changes haplo2:
650
number of changes
1300
2025-05-01 19:00:48.346036: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
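Note that `pred_altai` above feeds the model the average of the two one-hot haplotypes, so heterozygous positions become fractional "dosage" rows rather than valid one-hot vectors. An alternative, at twice the compute cost, is to predict each haplotype separately and average the predictions; a sketch (this is not what the notebook does above):

```{python}
# alternative sketch: predict per haplotype, then average the outputs
pred_h1 = model.predict_on_batch(haplo1_enc)['human'][0]
pred_h2 = model.predict_on_batch(haplo2_enc)['human'][0]
pred_altai_hapavg = (pred_h1 + pred_h2) / 2  # compare with pred_altai above
```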
### Plot Human reference epigenome

```{python}
# Plot Human reference epigenome
predictions = pred_human
tracks = {'DNASE:CD14-positive monocyte female': predictions[:, 41],
          'DNASE:keratinocyte female': predictions[:, 42],
          'CHIP:H3K27ac:keratinocyte female': predictions[:, 706],
          'CAGE:LCL': np.log10(1 + predictions[:, 5110])}
plot_tracks(tracks, target_interval)
```
### Plot Neanderthal epigenome

```{python}
# Plot Neanderthal epigenome
predictions = pred_altai
tracks = {'DNASE:CD14-positive monocyte female': predictions[:, 41],
          'DNASE:keratinocyte female': predictions[:, 42],
          'CHIP:H3K27ac:keratinocyte female': predictions[:, 706],
          'CAGE:LCL': np.log10(1 + predictions[:, 5110])}
plot_tracks(tracks, target_interval)
```
```{python}
get_summary(get_diffmat(pred_human, pred_altai))
get_summary(column_correlations(pred_human, pred_altai))
```
{'mean': np.float64(0.9963786626060424),
'median': np.float64(0.9985426207070724),
'minimum': np.float64(0.8450667250216665),
'maximum': np.float64(0.9999900682433824),
'q1': np.float64(0.9971584662782969),
'q3': np.float64(0.9990126346399585)}
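The summary above describes the per-track correlations between the human and Neanderthal predictions; their full distribution can be visualized with the `plot_hist` helper defined earlier:

```{python}
# histogram of the per-track human-vs-Neanderthal correlations
plot_hist(column_correlations(pred_human, pred_altai), 50,
          xlab='Column-wise correlation', ylab='Number of tracks',
          title='Human vs Neanderthal prediction correlations')
```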
### Create scatter plots comparing human and Neanderthal predictions

```{python}
def plot_prediction_scatters(pred_human, pred_altai, tracks):
    fig, axes = plt.subplots(2, 2, figsize=(12, 12))
    axes = axes.flatten()

    for idx, (track_name, track_idx) in enumerate(tracks.items()):
        ax = axes[idx]

        # Get predictions for this track
        human_pred = pred_human[:, track_idx]
        altai_pred = pred_altai[:, track_idx]

        # Create scatter plot
        ax.scatter(human_pred, altai_pred, alpha=0.5, s=10)

        # Add diagonal line
        min_val = min(human_pred.min(), altai_pred.min())
        max_val = max(human_pred.max(), altai_pred.max())
        ax.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.5)

        # Calculate correlation
        corr = np.corrcoef(human_pred, altai_pred)[0, 1]

        # Add labels and title
        ax.set_xlabel('Human Prediction')
        ax.set_ylabel('Neanderthal Prediction')
        ax.set_title(f'{track_name}\nCorrelation: {corr:.3f}')

        # Make plot square
        ax.set_aspect('equal')

    plt.tight_layout()
    plt.show()

# Define tracks
tracks = {
    'DNASE:CD14-positive monocyte female': 41,
    'DNASE:keratinocyte female': 42,
    'CHIP:H3K27ac:keratinocyte female': 706,
    'CAGE:LCL': 5110
}

# Create the plots
plot_prediction_scatters(pred_human, pred_altai, tracks)
```