"""Run Paladin inference on a single slide.

Given per-tile feature embeddings for one slide, run one Paladin model (or a
table of models selected by histology) and emit a CSV of per-target scores.
"""

import csv
import pickle  # nosec
import sys
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path
from typing import Any, List, Optional

import numpy  # NOTE(review): duplicate of `numpy as np` below; kept for compatibility
import pandas as pd
import numpy as np
import torch
from loguru import logger
from torch.utils.data import DataLoader

from data import SiteType, TileFeatureTensorDataset

# Constants
BATCH_SIZE = 8
NUM_WORKERS = 16


class UsageError(Exception):
    """A UsageError is raised when there's a problem with the command-line arguments."""

    pass


def load_model_map(model_map_path: str) -> dict[Any, Any]:
    """Load the table mapping histologies and targets to the paladin model (a
    pickle file) that predicts that target for that cancer subtype.

    The CSV must contain the columns ``cancer_subtype``, ``target_name``, and
    ``model_path``.

    A dict is returned, mapping each histology to a table mapping a target to
    the pathname for the model that predicts it.
    """
    models: dict[str, dict[str, str]] = defaultdict(dict)
    with Path(model_map_path).open() as fp:
        rdr = csv.DictReader(fp)
        for row in rdr:
            histology = row["cancer_subtype"]
            target = row["target_name"]
            model = row["model_path"]
            models[histology][target] = model
    return models


def load_aeon_scores(df: pd.DataFrame) -> dict[str, float]:
    """Load the output table from a single-slide Aeon run, listing Oncotree
    histologies and their confidence values.

    The DataFrame must contain the columns ``Cancer Subtype`` and ``Confidence``.

    A dict is returned, mapping each histology to its confidence score.
    """
    score = {}
    for _, row in df.iterrows():
        subtype = row["Cancer Subtype"]
        confidence = row["Confidence"]
        score[subtype] = confidence
    return score


def select_histologies(aeon_scores: dict[str, float]) -> list[str]:
    """Return the three top-scoring histologies, based on the given Aeon scores.

    Ties on score are broken by histology name (descending), since the sort key
    is the (score, histology) pair.
    """
    sorted_histologies = list(
        sorted([(v, k) for k, v in aeon_scores.items()], reverse=True)
    )
    return [histology for score, histology in sorted_histologies[:3]]


def select_models(histologies: list[str], model_map: dict[Any, Any]) -> list[Any]:
    """Return (histology, target, model_path) triples for every model in the
    model map that predicts a target for one of the given histologies.

    ``model_map`` is the nested dict produced by :func:`load_model_map`.
    """
    # BUG FIX: the previous version unpacked three values from
    # model_map.items(), which yields (histology, {target: model}) pairs and
    # raised ValueError on any non-empty map. Iterate the nested dict instead.
    models = []
    for histology, targets in model_map.items():
        if histology in histologies:
            for target, model in sorted(targets.items()):
                models.append((histology, target, model))
    return models


def run_model(device, dataset, model_path: str, num_workers, batch_size) -> float:
    """Run inference for the given embeddings and model.

    The point estimate is returned.

    SECURITY NOTE: the model file is deserialized with pickle, which executes
    arbitrary code; only load models from trusted sources.
    """
    logger.debug(f"[loading model {model_path}]")
    with Path(model_path).open("rb") as f:
        model = pickle.load(f)  # nosec
        # model = CPU_Unpickler(f).load()  # nosec
    model.to(device)
    model.eval()

    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )
    results_df = []
    # NOTE(review): only the first batch is consumed — presumably the dataset
    # yields a single slide-level item; confirm if datasets can be larger.
    batch = next(iter(dataloader))
    with torch.no_grad():
        batch["tile_tensor"] = batch["tile_tensor"].to(device)
        outputs = model(batch)
        logits = outputs["logits"]
        # Apply softplus to ensure positive values for beta-binomial parameters
        logits = torch.nn.functional.softplus(logits) + 1.0  # enforce concavity
        point_estimates = logits_to_point_estimates(logits)
        # sample_id = batch['sample_id'][0]
        class_assignment = point_estimates[0].item()
    return class_assignment


def logits_to_point_estimates(logits):
    """Convert paired (alpha, beta) logits to point estimates alpha/(alpha+beta).

    ``logits`` is a tensor of shape (batch_size, 2 * (n_clf_tasks + n_reg_tasks));
    the even columns are alpha and the odd columns beta. A tensor of shape
    (batch_size, n_clf_tasks + n_reg_tasks) is returned.
    """
    return logits[:, ::2] / (logits[:, ::2] + logits[:, 1::2])


def run_paladin(
    features: np.ndarray,
    aeon_results: Optional[pd.DataFrame] = None,
    histology_codes: Optional[List[str]] = None,
    model_map_path: Optional[str] = None,
    model_path: Optional[str] = None,
    metastatic: bool = False,
    batch_size: int = BATCH_SIZE,
    num_workers: int = NUM_WORKERS,
    use_cpu: bool = False,
):
    """Run Paladin inference on a single slide, using the given embeddings and
    either a single model or a table mapping histologies and targets to models.

    If histology_codes is given, it is a list of OncoTree codes for the slide.
    If aeon_results is given, it is a DataFrame with the output of an Aeon run
    on the slide.

    A DataFrame with columns "Cancer Subtype", "Biomarker", and "Score" is
    returned.
    """
    if aeon_results is not None:
        aeon_scores = load_aeon_scores(aeon_results)
        target_histologies = select_histologies(aeon_scores)
    else:
        # Fall back to an empty list so the model-map loop below doesn't
        # iterate None when neither source of histologies was supplied.
        target_histologies = histology_codes or []

    # Build a dataset to feed to the model
    site = SiteType.METASTASIS if metastatic else SiteType.PRIMARY
    dataset = TileFeatureTensorDataset(
        tile_features=features,
        site_type=site,
        n_max_tiles=20000,
    )
    device = torch.device(
        "cuda" if not use_cpu and torch.cuda.is_available() else "cpu"
    )

    results = []
    if model_path:
        # Single-model mode: histology/target are unknown.
        histology, target = "None", "None"
        try:
            score = run_model(device, dataset, model_path, num_workers, batch_size)
            results.append((histology, target, score))
            logger.info(f"histology: {histology} target: {target} score: {score}")
        except Exception as exc:
            logger.error(f"Unable to run model for {histology} target {target}\n{exc}")
    elif model_map_path:
        # Model-map mode: run every model for each selected histology.
        model_map = load_model_map(model_map_path)
        for histology in target_histologies:
            if histology not in model_map:
                logger.warning(f"Warning: no models found for {histology}")
                continue
            for target, model in sorted(model_map[histology].items()):
                try:
                    score = run_model(device, dataset, model, num_workers, batch_size)
                    results.append((histology, target, score))
                    logger.info(
                        f"histology: {histology} target: {target} score: {score}"
                    )
                except Exception as exc:
                    logger.error(
                        f"Unable to run model for {histology} target {target}\n{exc}"
                    )
    df = pd.DataFrame(results, columns=["Cancer Subtype", "Biomarker", "Score"])
    return df


def parse_args():
    """Parse command-line arguments, raising UsageError for invalid combinations."""
    parser = ArgumentParser(description="Run Paladin inference on a single slide")
    parser.add_argument(
        "-i",
        "--features-path",
        required=True,
        help="Pathname to a .pt file with optimus embeddings for this slide",
    )
    parser.add_argument(
        "-o",
        "--output-path",
        help="The filename for the Paladin predictions file (CSV)",
        required=True,
    )
    parser.add_argument(
        "-c",
        "--histology-codes",
        help="One or more histologies (OncoTree codes, comma-separated)",
    )
    parser.add_argument(
        "-a",
        "--aeon-predictions-path",
        help="Pathname to an aeon-predictions file (CSV) for this slide",
    )
    parser.add_argument(
        "-mm",
        "--model-map-path",
        help="A CSV file mapping histologies and targets to Paladin models (.pkl files). Contains columns 'cancer_subtype', 'target_name', and 'model_path'.",
    )
    parser.add_argument(
        "-m",
        "--model-path",
        help="The filename for a Paladin model to run inference with",
    )
    parser.add_argument(
        "--metastatic", action="store_true", help="Tissue is from a metastatic site"
    )
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Batch size")
    parser.add_argument(
        "--num-workers",
        type=int,
        default=NUM_WORKERS,
        help="Number of workers for data loading",
    )
    parser.add_argument("--use-cpu", action="store_true", help="Use CPU")
    opt = parser.parse_args()
    if opt.histology_codes and opt.aeon_predictions_path:
        raise UsageError(
            "You may specify either --histology-codes or --aeon-predictions-path, but not both."
        )
    if opt.histology_codes:
        opt.histology_codes = opt.histology_codes.split(",")
    if opt.model_path is None and opt.model_map_path is None:
        raise UsageError("You must specify either --model-path or --model-map-path")
    return opt


def main():
    """Entry point: load inputs, run inference, and write the predictions CSV."""
    opt = parse_args()
    features = torch.load(opt.features_path)
    logger.info(f"Loaded features from {opt.features_path}")
    aeon_results = None
    if opt.aeon_predictions_path:
        aeon_results = pd.read_csv(opt.aeon_predictions_path)
        logger.info(f"Loaded Aeon results from {opt.aeon_predictions_path}")
    df = run_paladin(
        features=features,
        aeon_results=aeon_results,
        histology_codes=opt.histology_codes,
        model_map_path=opt.model_map_path,
        model_path=opt.model_path,
        metastatic=opt.metastatic,
        batch_size=opt.batch_size,
        num_workers=opt.num_workers,
        use_cpu=opt.use_cpu,
    )
    df.to_csv(opt.output_path, index=False)
    logger.info(f"Wrote {opt.output_path}")


if __name__ == "__main__":
    main()