FuseMap Tutorial IV: Mapping to new datasets

Contents

FuseMap Tutorial IV: Mapping to new datasets#

In this tutorial, we will demonstrate how to map new spatial transcriptomics data to an existing FuseMap integration. This is useful when you want to analyze new samples in the context of previously integrated datasets.

FuseMap provides functionality to project new data points into the same latent space as the reference integration, allowing you to:

Compare new samples to existing integrated data
Transfer annotations and insights from the reference to new data
Analyze spatial patterns across old and new datasets together

We use the integrated model trained on merfish and starmap, and then transfer the model to slideseq.

MERFISH data: Zhang et al. Nature paper
STARmap data: Shi et al., Nature paper
Slide-seq data: Langlieb et al., Nature paper

1. Define arguments#

[1]:

import warnings
warnings.filterwarnings("ignore")

[2]:

import os
import scanpy as sc
from easydict import EasyDict as edict
from fusemap import seed_all, spatial_map
import copy
seed_all(0)

[3]:

pretrain_model_path = "/n/netscratch/nali_lab_seas/Everyone/mingze/FuseMap_imputation/workspace/integrate_merfish_starmap"
output_save_dir = "/n/netscratch/nali_lab_seas/Everyone/mingze/FuseMap_imputation/workspace/map_slideseq"
args = edict(dict(pretrain_model_path=pretrain_model_path,
                  output_save_dir=output_save_dir,
                  use_llm_gene_embedding="false",
                  keep_celltype='',
                  keep_tissueregion='',
                  ))
data_dir_list = ["/n/home11/mingzeyuan/FuseMap/data/02_imaging_sequencing_data/sequencing_test_data/slideseq_Puck34.h5ad"]

2. Data loading and pre-processing#

[5]:

X_input = []
for ind, data_dir in enumerate(data_dir_list):
    print(f"Loading {data_dir}")
    data = sc.read_h5ad(data_dir)
    # Handle spatial coordinates
    if "x" not in data.obs.columns:
        if "col" in data.obs.columns and "row" in X.obs.columns:
            data.obs["x"] = data.obs["col"]
            data.obs["y"] = data.obs["row"]
        elif "spatial" in data.obsm.keys():
            data.obs["x"] = data.obsm["spatial"][:,0]
            data.obs["y"] = data.obsm["spatial"][:,1]
        elif 'Raw_Slideseq_X' in data.obs.columns:
            data.obs['x'] = data.obs['Raw_Slideseq_X']
            data.obs['y'] = data.obs['Raw_Slideseq_Y']
        else:
            raise ValueError(f"Please provide spatial coordinates in the obs['x'] and obs['y'] columns for {data_dir}")

    # Add metadata
    data.obs['name'] = f'section{ind}'
    data.obs['file_name'] = os.path.basename(data_dir)
    print(f"Loaded {data.shape[0]} cells with {data.shape[1]} genes from {data.obs['file_name'].iloc[0]}")
    X_input.append(data)

# Set parameters for integration
kneighbor = ["delaunay"] * len(X_input)
input_identity = ["ST"] * len(X_input)
print(f"Loaded {len(X_input)} datasets")

Loading /n/home11/mingzeyuan/FuseMap/data/02_imaging_sequencing_data/sequencing_test_data/slideseq_Puck34.h5ad
Loaded 72542 cells with 20543 genes from slideseq_Puck34.h5ad
Loaded 1 datasets

3. Mapping fusemap to new datasets#

[5]:

for i in range(len(X_input)):
    args_i=copy.copy(args)
    args_i.output_save_dir = os.path.join(args.output_save_dir, X_input[i].obs['file_name'].unique()[0])
    spatial_map([X_input[i]], args_i, [kneighbor[i]], [input_identity[i]])

4. Transferring cell types#

[6]:

output_dir = os.path.join(args.output_save_dir, X_input[0].obs['file_name'].unique()[0])

[ ]:

from fusemap.utils import NNTransferTrain, NNTransfer, NNTransferPredictWithUncertainty
import torch
from sklearn import preprocessing
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import random_split
from torch import optim, nn
import sklearn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# open cell type embedding
ad_fusemap_emb = sc.read_h5ad(os.path.join(pretrain_model_path, 'ad_celltype_embedding.h5ad'))
ad_fusemap_transfer = sc.read_h5ad(os.path.join(output_dir, 'ad_celltype_embedding.h5ad'))
cell_type_index = 'gt_cell_type_main'

[8]:

ad_embed_train = ad_fusemap_emb[ad_fusemap_emb.obs.loc[ad_fusemap_emb.obs[cell_type_index]!='nan'].index]
ad_embed_train = ad_embed_train[ad_embed_train.obs[cell_type_index]!='Unannotated',:]

[9]:

sample1_embeddings = ad_embed_train.X
sample1_labels = list(ad_embed_train.obs[cell_type_index])

le = preprocessing.LabelEncoder()
le.fit(sample1_labels)

sample1_labels = le.transform(sample1_labels)
sample1_labels = sample1_labels.astype('str').astype('int')

dataset1 = TensorDataset(torch.Tensor(sample1_embeddings), torch.Tensor(sample1_labels).long())
train_size = int(0.8 * len(dataset1))  # Use 80% of the data for training
val_size = len(dataset1) - train_size
train_dataset, val_dataset = random_split(dataset1, [train_size, val_size])

val_size = int(0.5 * len(val_dataset))  # Use 10% of the data for val and 10% for testing
test_size = len(val_dataset) - val_size
val_dataset, test_dataset = random_split(val_dataset, [val_size, test_size])

[10]:

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weight = torch.Tensor(sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',
                                                                            classes=np.unique(sample1_labels),
                                                                            y=sample1_labels))
model = NNTransfer(input_dim=sample1_embeddings.shape[1],
                   output_dim=len(np.unique(sample1_labels)))
model.to(device)  # Move the model to GPU if available
criterion = nn.CrossEntropyLoss(weight=class_weight.to(device))
optimizer = optim.Adam(model.parameters(), lr=0.001)

NNTransferTrain(model, criterion, optimizer, train_loader, val_loader, device)

Epoch 0/200 - Train Loss: 2.8635384130083823, Accuracy: 90.04910829671749
Epoch 1/200 - Train Loss: 2.5025686803928093, Accuracy: 90.79865598345826
Epoch 12/200 - early stopping due to patience count

[11]:

sample2_embeddings = ad_fusemap_transfer.X
dataset2 = TensorDataset(torch.Tensor(sample2_embeddings))
dataloader2 = DataLoader(dataset2, batch_size=256, shuffle=False)
sample2_predictions, sample2_uncertainty = NNTransferPredictWithUncertainty(model, dataloader2, device)
sample2_predictions = le.inverse_transform(sample2_predictions)

ad_fusemap_transfer.obs['transfer_gt_cell_type_main'] = sample2_predictions

[12]:

# ad_fusemap_new = ad_fusemap_transfer[ad_fusemap_emb.obs['batch'].isin(['sample0'])]
cell_types = sorted(ad_fusemap_transfer.obs['transfer_gt_cell_type_main'].unique())

# Generate unique colors using a continuous colormap
num_colors = len(cell_types)
cmap = plt.get_cmap('gist_rainbow', num_colors)  # 'gist_rainbow' ensures distinct colors
colors = [cmap(i / num_colors) for i in range(num_colors)]

# Create a dictionary mapping tissue types to colors
colormap = dict(zip(cell_types, colors))

[13]:

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['figure.dpi'] = 300

# Get coordinates for both samples
x1 = pd.to_numeric(ad_embed_train.obs['x'], errors='coerce')
y1 = pd.to_numeric(ad_embed_train.obs['y'], errors='coerce')
x2 = pd.to_numeric(ad_fusemap_transfer.obs['x'], errors='coerce')
y2 = pd.to_numeric(ad_fusemap_transfer.obs['y'], errors='coerce')

# Calculate centroids
centroid1 = (np.mean(x1), np.mean(y1))
centroid2 = (np.mean(x2), np.mean(y2))

# Center both point sets
x1_centered = x1 - centroid1[0]
y1_centered = y1 - centroid1[1]
x2_centered = x2 - centroid2[0]
y2_centered = y2 - centroid2[1]

# Calculate scale factors to normalize both point sets
scale1 = np.sqrt(np.mean(x1_centered**2 + y1_centered**2))
scale2 = np.sqrt(np.mean(x2_centered**2 + y2_centered**2))

# Scale points to have same spread
x1_normalized = x1_centered / scale1
y1_normalized = y1_centered / scale1
x2_normalized = x2_centered / scale2
y2_normalized = y2_centered / scale2

# Calculate principal components for rotation alignment
coords1 = np.column_stack((x1_normalized, y1_normalized))
coords2 = np.column_stack((x2_normalized, y2_normalized))

# Create a figure with subplots for each tissue type (2 plots per tissue)
n_tissues = len(cell_types)
n_cols = 4
n_rows = (2 * n_tissues + n_cols - 1) // n_cols  # Double the rows since we need 2 plots per tissue
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), dpi=300)
axes = axes.flatten()

# Plot each tissue type separately in consecutive subplots
for idx, tissue in enumerate(cell_types):
    # First subplot for Sample 1
    ax1 = axes[2*idx]
    mask1 = ad_embed_train.obs['gt_cell_type_main'] == tissue
    # Plot other cells with transparency
    ax1.scatter(coords1[~mask1, 0], coords1[~mask1, 1], s=0.3,
              c='gray', alpha=0.1)
    # Plot cells of current tissue type
    ax1.scatter(coords1[mask1, 0], coords1[mask1, 1], s=0.3,
              c=[colormap[tissue]], label='Sample 1')
    ax1.set_title(f'{tissue} - Reference')
    ax1.axis('off')

    # Second subplot for Sample 2
    ax2 = axes[2*idx + 1]
    mask2 = ad_fusemap_transfer.obs['transfer_gt_cell_type_main'] == tissue
    # Plot other cells with transparency
    ax2.scatter(coords2[~mask2, 0], coords2[~mask2, 1], s=0.3,
              c='gray', alpha=0.1)
    # Plot cells of current tissue type
    ax2.scatter(coords2[mask2, 0], coords2[mask2, 1], s=0.3,
              c=[colormap[tissue]], label='Sample 2')
    ax2.set_title(f'{tissue} - Transferred')
    ax2.axis('off')
    # ax2.invert_xaxis()
    # ax2.invert_yaxis()

# Remove any empty subplots
for idx in range(2*n_tissues, len(axes)):
    fig.delaxes(axes[idx])

plt.tight_layout()
# plt.savefig('mouse_embryo_trasnsfer_by_tissue.png', dpi=300, bbox_inches='tight')
plt.show()

../_images/notebooks_4_map_new_dataset_customized_18_0.png

5. Transferring region annotations#

The method is the same with transferring cell types, so we only need to modify the annotations to region-based ones.

[17]:

# open cell type embedding
ad_fusemap_emb = sc.read_h5ad(os.path.join(pretrain_model_path, 'ad_celltype_embedding.h5ad'))
ad_fusemap_transfer = sc.read_h5ad(os.path.join(output_dir, 'ad_celltype_embedding.h5ad'))
region_index = 'gtTissueRegion'

ad_embed_train = ad_fusemap_emb[ad_fusemap_emb.obs.loc[ad_fusemap_emb.obs[region_index]!='nan'].index]
ad_embed_train = ad_embed_train[ad_embed_train.obs[region_index]!='Unannotated',:]

sample1_embeddings = ad_embed_train.X
sample1_labels = list(ad_embed_train.obs[region_index])

le = preprocessing.LabelEncoder()
le.fit(sample1_labels)

sample1_labels = le.transform(sample1_labels)
sample1_labels = sample1_labels.astype('str').astype('int')

dataset1 = TensorDataset(torch.Tensor(sample1_embeddings), torch.Tensor(sample1_labels).long())
train_size = int(0.8 * len(dataset1))  # Use 80% of the data for training
val_size = len(dataset1) - train_size
train_dataset, val_dataset = random_split(dataset1, [train_size, val_size])

val_size = int(0.5 * len(val_dataset))  # Use 10% of the data for val and 10% for testing
test_size = len(val_dataset) - val_size
val_dataset, test_dataset = random_split(val_dataset, [val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weight = torch.Tensor(sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',
                                                                            classes=np.unique(sample1_labels),
                                                                            y=sample1_labels))
model = NNTransfer(input_dim=sample1_embeddings.shape[1],
                   output_dim=len(np.unique(sample1_labels)))
model.to(device)  # Move the model to GPU if available
criterion = nn.CrossEntropyLoss(weight=class_weight.to(device))
optimizer = optim.Adam(model.parameters(), lr=0.001)

NNTransferTrain(model, criterion, optimizer, train_loader, val_loader, device)

sample2_embeddings = ad_fusemap_transfer.X
dataset2 = TensorDataset(torch.Tensor(sample2_embeddings))
dataloader2 = DataLoader(dataset2, batch_size=256, shuffle=False)
sample2_predictions, sample2_uncertainty = NNTransferPredictWithUncertainty(model, dataloader2, device)
sample2_predictions = le.inverse_transform(sample2_predictions)

ad_fusemap_transfer.obs['transfer_gt_region_main'] = sample2_predictions

# ad_fusemap_new = ad_fusemap_transfer[ad_fusemap_emb.obs['batch'].isin(['sample0'])]
regions = sorted(ad_fusemap_transfer.obs['transfer_gt_region_main'].unique())

# Generate unique colors using a continuous colormap
num_colors = len(regions)
cmap = plt.get_cmap('gist_rainbow', num_colors)  # 'gist_rainbow' ensures distinct colors
colors = [cmap(i / num_colors) for i in range(num_colors)]

# Create a dictionary mapping tissue types to colors
colormap = dict(zip(regions, colors))

Epoch 0/200 - Train Loss: 2.7291942955791084, Accuracy: 67.96919243402424
Epoch 1/200 - Train Loss: 2.5133547610130864, Accuracy: 71.0159700985389
Epoch 2/200 - Train Loss: 2.4860980381136355, Accuracy: 71.02729640955941
Epoch 3/200 - Train Loss: 2.476980630038441, Accuracy: 71.62759089364594
Epoch 14/200 - Train Loss: 2.4359164497126704, Accuracy: 71.68422244874844
Epoch 16/200 - Train Loss: 2.427447496117025, Accuracy: 72.01268546834297
Epoch 21/200 - Train Loss: 2.415807494218799, Accuracy: 72.05799071242497
Epoch 32/200 - Train Loss: 2.402252415815989, Accuracy: 72.13727488956847
Epoch 34/200 - Train Loss: 2.399995523950328, Accuracy: 72.14860120058897
Epoch 37/200 - Train Loss: 2.397495036539824, Accuracy: 72.34114848793747
Epoch 44/200 - Train Loss: 2.39161859733471, Accuracy: 72.40910635406048
Epoch 46/200 - Train Loss: 2.3898750794106633, Accuracy: 72.49971684222449
Epoch 49/200 - Train Loss: 2.3876315286194068, Accuracy: 72.51104315324498
Epoch 51/200 - Train Loss: 2.38583548259044, Accuracy: 72.6129799524295
Epoch 53/200 - Train Loss: 2.38467036900313, Accuracy: 72.6809378185525
Epoch 54/200 - Train Loss: 2.383786117685014, Accuracy: 72.714916751614
Epoch 56/200 - Train Loss: 2.3824867338374043, Accuracy: 72.82817986181901
Epoch 58/200 - Train Loss: 2.381176597830178, Accuracy: 72.98674821610601
Epoch 65/200 - Train Loss: 2.3766789902811465, Accuracy: 73.06603239324951
Epoch 68/200 - Train Loss: 2.3749509661094, Accuracy: 73.21327443651603
Epoch 71/200 - Train Loss: 2.373402103997659, Accuracy: 73.26990599161853
Epoch 77/200 - Train Loss: 2.3700081602386804, Accuracy: 73.30388492468003
Epoch 80/200 - Train Loss: 2.368684012820755, Accuracy: 73.32653754672103
Epoch 82/200 - Train Loss: 2.3674023972041365, Accuracy: 73.37184279080303
Epoch 84/200 - Train Loss: 2.36673640254615, Accuracy: 73.53041114509004
Epoch 95/200 - early stopping due to patience count

[ ]:

plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['figure.dpi'] = 300

# Get coordinates for both samples
x1 = pd.to_numeric(ad_embed_train.obs['x'], errors='coerce')
y1 = pd.to_numeric(ad_embed_train.obs['y'], errors='coerce')
x2 = pd.to_numeric(ad_fusemap_transfer.obs['x'], errors='coerce')
y2 = pd.to_numeric(ad_fusemap_transfer.obs['y'], errors='coerce')

# Calculate centroids
centroid1 = (np.mean(x1), np.mean(y1))
centroid2 = (np.mean(x2), np.mean(y2))

# Center both point sets
x1_centered = x1 - centroid1[0]
y1_centered = y1 - centroid1[1]
x2_centered = x2 - centroid2[0]
y2_centered = y2 - centroid2[1]

# Calculate scale factors to normalize both point sets
scale1 = np.sqrt(np.mean(x1_centered**2 + y1_centered**2))
scale2 = np.sqrt(np.mean(x2_centered**2 + y2_centered**2))

# Scale points to have same spread
x1_normalized = x1_centered / scale1
y1_normalized = y1_centered / scale1
x2_normalized = x2_centered / scale2
y2_normalized = y2_centered / scale2

# Calculate principal components for rotation alignment
coords1 = np.column_stack((x1_normalized, y1_normalized))
coords2 = np.column_stack((x2_normalized, y2_normalized))

# Create a figure with subplots for each tissue type (2 plots per tissue)
n_tissues = len(regions)
n_cols = 4
n_rows = (2 * n_tissues + n_cols - 1) // n_cols  # Double the rows since we need 2 plots per tissue
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), dpi=300)
axes = axes.flatten()

# Plot each tissue type separately in consecutive subplots
for idx, tissue in enumerate(regions):
    # First subplot for Sample 1
    ax1 = axes[2*idx]
    mask1 = ad_embed_train.obs[region_index] == tissue
    # Plot other cells with transparency
    ax1.scatter(coords1[~mask1, 0], coords1[~mask1, 1], s=0.3,
              c='gray', alpha=0.1)
    # Plot cells of current tissue type
    ax1.scatter(coords1[mask1, 0], coords1[mask1, 1], s=0.3,
              c=[colormap[tissue]], label='Sample 1')
    ax1.set_title(f'{tissue} - Reference')
    ax1.axis('off')

    # Second subplot for Sample 2
    ax2 = axes[2*idx + 1]
    mask2 = ad_fusemap_transfer.obs['transfer_gt_region_main'] == tissue
    # Plot other cells with transparency
    ax2.scatter(coords2[~mask2, 0], coords2[~mask2, 1], s=0.3,
              c='gray', alpha=0.1)
    # Plot cells of current tissue type
    ax2.scatter(coords2[mask2, 0], coords2[mask2, 1], s=0.3,
              c=[colormap[tissue]], label='Sample 2')
    ax2.set_title(f'{tissue} - Transferred')
    ax2.axis('off')
    # ax2.invert_xaxis()
    # ax2.invert_yaxis()

# Remove any empty subplots
for idx in range(2*n_tissues, len(axes)):
    fig.delaxes(axes[idx])

plt.tight_layout()
# plt.savefig('mouse_embryo_trasnsfer_by_tissue.png', dpi=300, bbox_inches='tight')
plt.show()