# mosaic-test / paladin_inference.py
# (commit ca72d12 — "fix: tiling level")
import csv
import pickle # nosec
import sys
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path
from typing import Any, List, Optional
import numpy
import pandas as pd
import numpy as np
import torch
from loguru import logger
from torch.utils.data import DataLoader
from data import SiteType, TileFeatureTensorDataset
# Constants
BATCH_SIZE = 8  # default DataLoader batch size (overridable via --batch-size)
NUM_WORKERS = 16  # default DataLoader worker count (overridable via --num-workers)
class UsageError(Exception):
    """Raised when the command-line arguments are invalid or inconsistent."""
def load_model_map(model_map_path: str) -> dict[Any, Any]:
    """Load the table mapping histologies and targets to the paladin
    model (a pickle file) that predicts that target for that cancer subtype.

    The CSV at ``model_map_path`` must have columns ``cancer_subtype``,
    ``target_name``, and ``model_path``.  A dict is returned, mapping each
    histology to a table that maps a target to the pathname of the model
    that predicts it.
    """
    mapping: dict[Any, Any] = defaultdict(dict)
    with Path(model_map_path).open() as handle:
        for record in csv.DictReader(handle):
            subtype = record["cancer_subtype"]
            mapping[subtype][record["target_name"]] = record["model_path"]
    return mapping
def load_aeon_scores(df: pd.DataFrame) -> dict[str, float]:
    """Load the output table from a single-slide Aeon run, listing Oncotree
    histologies and their confidence values.

    A dict is returned, mapping each histology (the "Cancer Subtype" column)
    to its confidence score (the "Confidence" column).
    """
    return {
        row["Cancer Subtype"]: row["Confidence"] for _, row in df.iterrows()
    }
def select_histologies(aeon_scores: dict[str, float], top_n: int = 3) -> list[str]:
    """Return the top-scoring histologies, based on the given Aeon scores.

    Args:
        aeon_scores: mapping of histology name -> confidence score.
        top_n: how many histologies to return (default 3, matching the
            original hard-coded behavior).

    Returns:
        The ``top_n`` histology names, highest score first.  Ties in score
        are broken by histology name in reverse lexicographic order (the
        consequence of sorting (score, name) pairs in reverse).
    """
    ranked = sorted(
        ((score, name) for name, score in aeon_scores.items()), reverse=True
    )
    return [name for _, name in ranked[:top_n]]
def select_models(histologies: list[str], model_map: dict[Any, Any]) -> list[Any]:
    """Select the models applicable to the given histologies.

    Args:
        histologies: the histology codes to keep.
        model_map: mapping of histology -> {target: model_path}, as returned
            by load_model_map.

    Returns:
        A flat list of (histology, target, model_path) tuples, one per
        target of each histology present in ``histologies``.

    Bug fix: the original iterated ``model_map.items()`` unpacking three
    values per item, but items() yields (histology, targets_dict) pairs, so
    it raised ValueError on any non-empty map.  The nested dict is now
    traversed explicitly.
    """
    models = []
    for histology, targets in model_map.items():
        if histology in histologies:
            for target, model in targets.items():
                models.append((histology, target, model))
    return models
def run_model(device, dataset, model_path: str, num_workers, batch_size) -> float:
    """Run inference for the given embeddings and model.

    Args:
        device: torch.device to run inference on.
        dataset: dataset whose items are dicts containing a "tile_tensor"
            entry (e.g. a TileFeatureTensorDataset).
        model_path: pathname of a pickled torch model; the model must be
            callable on a batch dict and return a dict with a "logits" entry.
        num_workers: DataLoader worker count.
        batch_size: DataLoader batch size.

    Returns:
        The point estimate (a float) for the first sample of the first batch.

    NOTE(review): only the first batch is consumed and only the first
    sample's estimate is returned — presumably the dataset holds a single
    slide; confirm before reusing with multi-slide datasets.
    """
    logger.debug(f"[loading model {model_path}]")
    # SECURITY: pickle.load executes arbitrary code on load; only load
    # models from trusted sources.
    with Path(model_path).open("rb") as f:
        model = pickle.load(f)  # nosec
    model.to(device)
    model.eval()
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )
    batch = next(iter(dataloader))
    with torch.no_grad():
        batch["tile_tensor"] = batch["tile_tensor"].to(device)
        outputs = model(batch)
        logits = outputs["logits"]
        # Softplus + 1 maps raw logits into (1, inf), keeping both
        # beta-binomial parameters positive (and > 1, enforcing concavity).
        logits = torch.nn.functional.softplus(logits) + 1.0
        point_estimates = logits_to_point_estimates(logits)
        class_assignment = point_estimates[0].item()
    return class_assignment
def logits_to_point_estimates(logits):
    """Convert interleaved parameter pairs to per-task point estimates.

    ``logits`` has shape (batch_size, 2 * (n_clf_tasks + n_reg_tasks)):
    even columns hold the first parameter (alpha) and odd columns the
    second (beta) of each task.  Returns a tensor of shape
    (batch_size, n_clf_tasks + n_reg_tasks) whose entries are
    alpha / (alpha + beta).
    """
    alphas = logits[:, 0::2]
    betas = logits[:, 1::2]
    return alphas / (alphas + betas)
def run_paladin(
    features: np.ndarray,
    aeon_results: Optional[pd.DataFrame] = None,
    histology_codes: Optional[List[str]] = None,
    model_map_path: Optional[str] = None,
    model_path: Optional[str] = None,
    metastatic: bool = False,
    batch_size: int = BATCH_SIZE,
    num_workers: int = NUM_WORKERS,
    use_cpu: bool = False,
) -> pd.DataFrame:
    """Run Paladin inference on a single slide, using the given embeddings
    and either a single model or a table mapping histologies and targets
    to models.

    If aeon_results is given, it is a DataFrame with the output of an Aeon
    run on the slide (see load_aeon_scores); the top-scoring histologies
    are selected from it.  Otherwise histology_codes (a list of OncoTree
    codes) is used directly.  When both are given, aeon_results wins.

    Exactly one of model_path (a single pickled model) or model_map_path
    (a CSV mapping histologies/targets to models) should be supplied;
    model_path takes precedence when both are given.

    Returns a DataFrame with columns "Cancer Subtype", "Biomarker", and
    "Score".  Individual model failures are logged and skipped, not raised,
    so the result may be empty.
    """
    # Decide which histologies to run models for.
    if aeon_results is not None:
        aeon_scores = load_aeon_scores(aeon_results)
        target_histologies = select_histologies(aeon_scores)
    else:
        # NOTE(review): if model_map_path is used, histology_codes must be
        # non-None here — otherwise the loop below raises TypeError.
        # Confirm callers always supply one of aeon_results/histology_codes.
        target_histologies = histology_codes
    # Build a dataset to feed to the model
    site = SiteType.METASTASIS if metastatic else SiteType.PRIMARY
    dataset = TileFeatureTensorDataset(
        tile_features=features,
        site_type=site,
        n_max_tiles=20000,
    )
    device = torch.device(
        "cuda" if not use_cpu and torch.cuda.is_available() else "cpu"
    )
    results = []  # accumulates (histology, target, score) triples
    if model_path:
        # Single-model mode: histology/target are unknown, recorded as "None".
        histology, target = "None", "None"
        try:
            score = run_model(device, dataset, model_path, num_workers, batch_size)
            results.append((histology, target, score))
            logger.info(f"histology: {histology} target: {target} score: {score}")
        except Exception as exc:
            logger.error(f"Unable to run model for {histology} target {target}\n{exc}")
    elif model_map_path:
        # Model-map mode: run every model for each selected histology.
        model_map = load_model_map(model_map_path)
        for histology in target_histologies:
            if histology not in model_map:
                logger.warning(f"Warning: no models found for {histology}")
                continue
            # sorted() gives a deterministic target order in the output.
            for target, model in sorted(model_map[histology].items()):
                try:
                    score = run_model(device, dataset, model, num_workers, batch_size)
                    results.append((histology, target, score))
                    logger.info(
                        f"histology: {histology} target: {target} score: {score}"
                    )
                except Exception as exc:
                    logger.error(
                        f"Unable to run model for {histology} target {target}\n{exc}"
                    )
    df = pd.DataFrame(results, columns=["Cancer Subtype", "Biomarker", "Score"])
    return df
def parse_args():
    """Parse and validate the command-line arguments.

    Returns:
        The parsed argparse namespace; ``histology_codes`` is split into a
        list if provided.

    Raises:
        UsageError: if both --histology-codes and --aeon-predictions-path
            are given, or if neither --model-path nor --model-map-path is.
    """
    parser = ArgumentParser(description="Run Paladin inference on a single slide")
    parser.add_argument(
        "-i",
        "--features-path",
        required=True,
        help="Pathname to a .pt file with optimus embeddings for this slide",
    )
    parser.add_argument(
        "-o",
        "--output-path",
        help="The filename for the Paladin predictions file (CSV)",
        required=True,
    )
    parser.add_argument(
        "-c",
        "--histology-codes",
        help="One or more histologies (OncoTree codes, comma-separated)",
    )
    parser.add_argument(
        "-a",
        "--aeon-predictions-path",
        help="Pathname to an aeon-predictions file (CSV) for this slide",
    )
    parser.add_argument(
        "-mm",
        "--model-map-path",
        # Column names fixed to match what load_model_map actually reads:
        # the first column is 'cancer_subtype', not 'histology'.
        help="A CSV file mapping histologies and targets to Paladin models (.pkl files). Contains columns 'cancer_subtype', 'target_name', and 'model_path'.",
    )
    parser.add_argument(
        "-m",
        "--model-path",
        help="The filename for a Paladin model to run inference with",
    )
    parser.add_argument(
        "--metastatic", action="store_true", help="Tissue is from a metastatic site"
    )
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Batch size")
    parser.add_argument(
        "--num-workers",
        type=int,
        default=NUM_WORKERS,
        help="Number of workers for data loading",
    )
    parser.add_argument("--use-cpu", action="store_true", help="Use CPU")
    opt = parser.parse_args()
    if opt.histology_codes and opt.aeon_predictions_path:
        # Error message fixed: the option is --histology-codes, not --codes.
        raise UsageError(
            "You may specify either --histology-codes or --aeon-predictions-path, "
            "but not both."
        )
    if opt.histology_codes:
        # Split the comma-separated codes into a list for run_paladin.
        opt.histology_codes = opt.histology_codes.split(",")
    if opt.model_path is None and opt.model_map_path is None:
        raise UsageError("You must specify either --model-path or --model-map-path")
    return opt
def main():
    """CLI entry point: load inputs, run Paladin inference, write the CSV.

    Reads the features file (and optionally the Aeon predictions CSV) named
    on the command line, runs run_paladin, and writes the resulting
    predictions DataFrame to --output-path.
    """
    opt = parse_args()
    # NOTE(review): torch.load unpickles arbitrary objects — the features
    # file must come from a trusted source.
    features = torch.load(opt.features_path)
    logger.info(f"Loaded features from {opt.features_path}")
    aeon_results = None
    if opt.aeon_predictions_path:
        aeon_results = pd.read_csv(opt.aeon_predictions_path)
        logger.info(f"Loaded Aeon results from {opt.aeon_predictions_path}")
    df = run_paladin(
        features=features,
        aeon_results=aeon_results,
        histology_codes=opt.histology_codes,
        model_map_path=opt.model_map_path,
        model_path=opt.model_path,
        metastatic=opt.metastatic,
        batch_size=opt.batch_size,
        num_workers=opt.num_workers,
        use_cpu=opt.use_cpu,
    )
    df.to_csv(opt.output_path, index=False)
    logger.info(f"Wrote {opt.output_path}")
# Run the CLI when executed as a script (not on import).
if __name__ == "__main__":
    main()