Protein Is a Point¶
Use embeddings as feature vectors for similarity search, outlier detection, and landscape visualization.
⚠️ Preview Feature — The
biolmai.pipeline module used in this guide is currently in preview and not yet publicly released. Access is available to early users on request. Contact us to get access.
What you'll learn:
- Tool 1: "Find me more like this one" — cosine similarity search
- Tool 2: "Is this sequence weird?" — outlier detection
- Tool 3: "Show me the landscape" — PCA colored by predicted property
Requirements:
pip install biolmai[pipeline] matplotlib numpy scikit-learn
export BIOLMAI_TOKEN=your-token-here
Setup¶
import os
from biolmai.pipeline import (
DataPipeline, DuckDBDataStore,
ThresholdFilter, RankingFilter,
ValidAminoAcidFilter, EmbeddingSpec,
DiversitySamplingFilter,
)
# Read the API token from the environment. An unset variable and an empty
# one are treated the same way: stop immediately with setup instructions.
TOKEN = os.environ.get("BIOLMAI_TOKEN", "")
if not TOKEN:
    missing_token_help = (
        "Set BIOLMAI_TOKEN before running.\n"
        "Get one at https://biolm.ai/ui/accounts/user-api-tokens/"
    )
    raise EnvironmentError(missing_token_help)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
Get embeddings for a peptide library¶
LIBRARY = [
"GIGKFLHSAKKFGKAFVGEIMNS", "GLFDIIKKIAESF", "KLAKLAKKLAKLAK",
"RRWWRRWWRR", "KWKLFKKI", "GIGKFLHSAK", "RLFDKIRQ",
"GLFDIVKKVVGALGSL", "FLPLILRKIVTAL", "KWKWKWKWKW",
"GIKKFLGSIWKFIKAFVKEIMN", "RRLCRIVVIRVCR", "RRWQWR",
"RWRWRW", "FKRIVQRIKDFL", "KWKLFKKIPKFLHLAK",
"GLFDIIKKIAESFLPKV", "GIGKFLHSAKKFGKAFV", "KWKLFKKIPKFLHLAKKF",
"LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES",
]
HIT_SEQUENCE = LIBRARY[0] # our "hit" from a previous screen
ds = DuckDBDataStore("embedding_demo.duckdb")
pipeline = DataPipeline(sequences=LIBRARY, datastore=ds, verbose=True)
pipeline.add_prediction(
"esm2-8m", action="encode",
embedding_extractor=EmbeddingSpec(key="embeddings", layer=6),
stage_name="embed",
)
pipeline.run()
sequence_ids = [r[0] for r in ds.conn.execute("SELECT sequence_id FROM sequences ORDER BY sequence_id").fetchall()]
sequences = [r[0] for r in ds.conn.execute("SELECT sequence FROM sequences ORDER BY sequence_id").fetchall()]
emb_map = ds.get_embeddings_bulk(sequence_ids, model_name="esm2-8m")
mat = np.stack([emb_map[sid] for sid in sequence_ids])
print(f"Embedding matrix: {mat.shape}")Tool 1: Find me more like this one¶
Cosine similarity on embeddings gives a ranked list of nearest neighbors.
hit_idx = sequences.index(HIT_SEQUENCE)
hit_vec = mat[hit_idx]
hit_norm = hit_vec / np.linalg.norm(hit_vec)
sims = []
for i, sid in enumerate(sequence_ids):
if i == hit_idx:
continue
v = mat[i]
sim = np.dot(hit_norm, v / np.linalg.norm(v))
sims.append((sim, sequences[i]))
sims.sort(reverse=True)
print(f"Query: {HIT_SEQUENCE}\n")
print("Top 5 most similar sequences:")
for sim, seq in sims[:5]:
print(f" {sim:.3f} {seq}")Tool 2: Is this sequence weird?¶
Distance from the centroid flags outliers — contamination, misannotations, or genuinely novel variants.
centroid = mat.mean(axis=0)
distances = np.linalg.norm(mat - centroid, axis=1)
threshold = distances.mean() + 2 * distances.std()
print(f"Mean distance: {distances.mean():.3f}")
print(f"Outlier threshold (mean + 2σ): {threshold:.3f}\n")
outliers = [(dist, seq) for dist, seq in zip(distances, sequences) if dist > threshold]
if outliers:
print("Outlier sequences:")
for dist, seq in sorted(outliers, reverse=True):
print(f" distance={dist:.3f} {seq}")
else:
print("No outliers detected")Tool 3: Show me the landscape¶
PCA gives a 2D map of the library. Color by any property to see functional structure.
# First get Tm predictions for coloring
pipeline2 = DataPipeline(sequences=LIBRARY, datastore=ds, verbose=True)
pipeline2.add_prediction("temperature-regression", extractions="prediction", columns="melting_temperature")
pipeline2.run()
tm_rows = ds.conn.execute("""
SELECT s.sequence_id, p.value AS tm
FROM sequences s JOIN predictions p ON s.sequence_id = p.sequence_id
WHERE p.prediction_type = 'melting_temperature'
ORDER BY s.sequence_id
""").df()
tm_values = tm_rows["tm"].values
pca = PCA(n_components=2)
coords = pca.fit_transform(mat)
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
# Left: colored by Tm
sc = axes[0].scatter(coords[:, 0], coords[:, 1], c=tm_values,
cmap="coolwarm", edgecolors="black", linewidth=0.3, alpha=0.85, s=60)
plt.colorbar(sc, ax=axes[0], label="Predicted Tm (°C)")
axes[0].set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%})")
axes[0].set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%})")
axes[0].set_title("Library landscape — colored by predicted Tm")
# Right: highlight outliers
is_outlier = distances > threshold
axes[1].scatter(coords[~is_outlier, 0], coords[~is_outlier, 1],
c="steelblue", edgecolors="black", linewidth=0.3, alpha=0.7, s=60, label="Normal")
axes[1].scatter(coords[is_outlier, 0], coords[is_outlier, 1],
c="crimson", edgecolors="black", linewidth=0.5, s=90, zorder=5, label="Outlier")
axes[1].set_title("Outlier detection (>2σ from centroid)")
axes[1].legend()
plt.tight_layout()
plt.show()Cleanup¶
# Close the datastore, then delete the demo database file it was opened on.
ds.close()
import os

os.remove("embedding_demo.duckdb")