Preranked GSEA

Contents

Preranked GSEA#

Setup#

Download a MEME motif database file containing human transcription factor binding motifs from the Cis-BP resource, alongside the human genome in fasta format.

%%bash
[ -f data/cisBP_human.meme ] || wget -O data/cisBP_human.meme https://osf.io/download/uk6vn
[ -f data/GRCh38.fa ] || wget -O data/GRCh38.fa.gz https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/GRCh38.p13.genome.fa.gz
[ -f data/GRCh38.fa ] || gunzip -c data/GRCh38.fa.gz > data/GRCh38.fa

import gatac as ga
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sc.set_figure_params(dpi=100, frameon=False)

MOTIF_FILE = "data/cisBP_human.meme"
peak_adata = ad.read_h5ad("data/pbmc_peaks.h5ad")
# Get marker peaks for cluster 0
markers = ga.tl.marker_peaks(peak_adata, "leiden")
print("Available clusters:", list(markers.keys()))

43:10 - gatac.tl.markers - INFO - Running marker peak detection for groups in 'leiden'
43:10 - gatac.tl.markers - INFO - Testing 13 groups, 178,350 features, 5,001 cells
43:10 - gatac.tl.markers - INFO - Using max_cells=500 per group for balanced comparison
43:12 - gatac.tl.markers - INFO - Marker peak detection complete. Results stored in adata.uns['marker_peaks']

Available clusters: ['0', '1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9']

1. Prepare ranked peak list#

# Use cluster 0 markers ranked by log2 fold-change
cluster = "0"
marker_df = markers[cluster].sort_values("log2_fc", ascending=False)

# Build a DataFrame indexed by peak name with the expected column name
rankings = marker_df.set_index("feature")[["log2_fc"]].rename(columns={"log2_fc": "log2fc"})
print(f"Ranked {len(rankings):,} peaks for cluster {cluster}")

Ranked 5,575 peaks for cluster 0

2. Build motif gene sets#

Map each motif to the set of peaks that contain that motif.

motifs = ga.tl.read_motifs(MOTIF_FILE, unique=True)
ga.tl.scan_motifs(
    peak_adata,
    motifs=motifs,
    genome_fasta="data/GRCh38.fa",
)
motif_names = [m.name for m in motifs]
# Build gene sets: motif → list of peaks containing that motif
motif_matrix = peak_adata.varm["motif_match"]
import scipy.sparse as sp
peak_names = peak_adata.var_names.tolist()
motif_terms = {
    motif_names[j]: [peak_names[i] for i in motif_matrix[:, j].nonzero()[0]]
    for j in range(motif_matrix.shape[1])
    if motif_matrix[:, j].nnz > 0
}
print(f"Built {len(motif_terms)} motif gene sets")

43:18 - gatac.tl.chromvar - INFO - Scanning 1165 motifs in 178350 peaks...
43:18 - gatac.tl.chromvar - INFO - Using 0-based coordinate system
43:18 - gatac.tl.chromvar - INFO - Fetching sequences from genome...

15:43:20 - gatac.tl.chromvar - INFO - Background from sequences: A=0.2589 C=0.2412 G=0.2411 T=0.2588
15:43:20 - gatac.tl.chromvar - INFO - Computing score thresholds...

15:43:24 - gatac.tl.chromvar - INFO - Scanning motifs on GPU in 4 batch(es) of 50000 peaks...

motif_matrix = peak_adata.varm["motif_match"]
import scipy.sparse as sp
peak_names = peak_adata.var_names.tolist()
motif_terms = {
    motif_names[j]: [peak_names[i] for i in motif_matrix[:, j].nonzero()[0]]
    for j in range(motif_matrix.shape[1])
    if motif_matrix[:, j].nnz > 0
}
print(f"Built {len(motif_terms)} motif gene sets")

Built 1165 motif gene sets

3. Run preranked GSEA#

import time

t0 = time.perf_counter()
gsea_df = ga.tl.gsea_motif_enrichment(
    peak_adata,
    rankings=rankings,
)
print(f"GSEA in {time.perf_counter() - t0:.1f}s")
gsea_df.nsmallest(10, "fdr")

24:26 - gatac.tl.motif - INFO - Built 1,165 motif feature sets (≥15 peaks; filter to ≥15)
24:26 - gatac.tl.motif - INFO - Running GSEA (gpu) on provided rankings...
24:27 - gatac.tl.gsea - INFO - GPU GSEA: 1137 feature sets, 5305 features, 1000 permutations  (gs_batch=4, perm_batch=256)
24:27 - gatac.tl.gsea - INFO - Generating permutation indices...

GSEA in 11.1s

	motif	NES	pval	fdr	lead_edge_n	set_size	lead_edge_frac
0	CEBPA	5.364148	0.0	0.0	107	315	0.339683
1	CEBPB	4.925610	0.0	0.0	66	160	0.412500
2	CEBPG	4.235527	0.0	0.0	59	129	0.457364
3	CEBPE	3.591549	0.0	0.0	41	101	0.405941
4	CEBPD	3.550723	0.0	0.0	42	113	0.371681
5	NFE2L2	3.454037	0.0	0.0	95	356	0.266854
6	BACH1	3.372003	0.0	0.0	96	360	0.266667
7	FOSL2	2.807798	0.0	0.0	86	345	0.249275
8	TEF	2.780295	0.0	0.0	33	94	0.351064
9	BACH2	2.765251	0.0	0.0	117	402	0.291045

4. GSEA enrichment plot#

top_results = gsea_df.nsmallest(20, "fdr")

fig, ax = plt.subplots(figsize=(7, 6))
colors = ["#d62728" if nes > 0 else "#1f77b4" for nes in top_results["NES"]]
bars = ax.barh(range(len(top_results)), top_results["NES"], color=colors)
ax.set_yticks(range(len(top_results)))
ax.set_yticklabels(top_results["motif"])
ax.axvline(0, color="black", lw=0.8)
ax.set_xlabel("Normalized Enrichment Score")
ax.set_title(f"Top GSEA motif results — cluster {cluster}")
plt.tight_layout()
plt.show()

../_images/3d8f36b010d94ce8eeea3946564c062472914b42ac47004fbda9fbb854fcf6f9.png

Next steps#

API: ga.tl.gsea_motif_enrichment