Spaces:
Sleeping
Sleeping
| import csv | |
| import pickle # nosec | |
| import sys | |
| from argparse import ArgumentParser | |
| from collections import defaultdict | |
| from pathlib import Path | |
| from typing import Any, List, Optional | |
| import numpy | |
| import pandas as pd | |
| import numpy as np | |
| import torch | |
| from loguru import logger | |
| from torch.utils.data import DataLoader | |
| from data import SiteType, TileFeatureTensorDataset | |
| # Constants | |
| BATCH_SIZE = 8 | |
| NUM_WORKERS = 16 | |
class UsageError(Exception):
    """Raised when the command-line arguments are invalid or inconsistent."""
def load_model_map(model_map_path: str) -> dict[Any, Any]:
    """Load the table mapping histologies and targets to the paladin
    model (a pickle file) that predicts that target for that cancer subtype.

    The CSV at *model_map_path* must have columns 'cancer_subtype',
    'target_name', and 'model_path'.

    Returns a dict mapping each histology to a dict that maps each
    target name to the pathname of the model predicting it.
    """
    mapping: dict[Any, Any] = defaultdict(dict)
    with Path(model_map_path).open() as handle:
        for record in csv.DictReader(handle):
            subtype = record["cancer_subtype"]
            mapping[subtype][record["target_name"]] = record["model_path"]
    return mapping
def load_aeon_scores(df: pd.DataFrame) -> dict[str, float]:
    """Load the output table from a single-slide Aeon run, listing Oncotree
    histologies and their confidence values.

    Returns a dict mapping each histology to its confidence score.
    If a subtype appears more than once, the last row wins (matching
    dict-insertion semantics).
    """
    return dict(zip(df["Cancer Subtype"], df["Confidence"]))
def select_histologies(aeon_scores: dict[str, float], top_n: int = 3) -> list[str]:
    """Return the top-scoring histologies, based on the given Aeon scores.

    Args:
        aeon_scores: mapping from histology code to confidence score.
        top_n: number of histologies to return (default 3, matching the
            original hard-coded behavior).

    Ties on the score are broken by histology code, descending — this
    preserves the original (score, code) tuple-sort order.
    """
    ranked = sorted(
        ((score, histology) for histology, score in aeon_scores.items()),
        reverse=True,
    )
    return [histology for _, histology in ranked[:top_n]]
def select_models(histologies: list[str], model_map: dict[Any, Any]) -> list[Any]:
    """Return (histology, target, model_path) triples for the given histologies.

    Args:
        histologies: histology codes to keep.
        model_map: nested mapping histology -> {target: model_path}, as
            built by load_model_map.

    Bug fix: the previous implementation unpacked ``model_map.items()``
    (2-tuples) into three names, which raised ValueError on every call.
    """
    models = []
    for histology, targets in model_map.items():
        if histology in histologies:
            for target, model in targets.items():
                models.append((histology, target, model))
    return models
def run_model(device, dataset, model_path: str, num_workers, batch_size) -> float:
    """Run inference for the given embeddings and model.

    Loads a pickled Paladin model, moves it to *device*, runs a single
    no-grad forward pass on the first batch of *dataset*, and returns the
    point estimate for the first sample.

    NOTE(review): only the first batch — and only its first sample — is
    scored; presumably the dataset yields one sample per slide. Confirm.

    Removed: unused ``results_df`` local and dead commented-out code.
    """
    logger.debug(f"[loading model {model_path}]")
    # SECURITY: pickle.load executes arbitrary code on load; only ever
    # point model_path at trusted model files.
    with Path(model_path).open("rb") as f:
        model = pickle.load(f)  # nosec
    model.to(device)
    model.eval()
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )
    batch = next(iter(dataloader))
    with torch.no_grad():
        batch["tile_tensor"] = batch["tile_tensor"].to(device)
        outputs = model(batch)
        logits = outputs["logits"]
        # Apply softplus to ensure positive values for beta-binomial parameters
        logits = torch.nn.functional.softplus(logits) + 1.0  # enforce concavity
        point_estimates = logits_to_point_estimates(logits)
    class_assignment = point_estimates[0].item()
    return class_assignment
def logits_to_point_estimates(logits):
    """Convert interleaved beta-binomial parameters to point estimates.

    *logits* has shape (batch_size, 2 * (n_clf_tasks + n_reg_tasks)) with
    alpha/beta pairs interleaved along dim 1; the result has shape
    (batch_size, n_clf_tasks + n_reg_tasks) and equals
    alpha / (alpha + beta) for each pair.
    """
    alphas = logits[:, 0::2]
    betas = logits[:, 1::2]
    return alphas / (alphas + betas)
def run_paladin(
    features: np.ndarray,
    aeon_results: Optional[pd.DataFrame] = None,
    histology_codes: List[str] = None,
    model_map_path: str = None,
    model_path: str = None,
    metastatic: bool = False,
    batch_size: int = BATCH_SIZE,
    num_workers: int = NUM_WORKERS,
    use_cpu: bool = False,
):
    """Run Paladin inference on a single slide, using the given embeddings
    and either a single model or a table mapping histologies and targets
    to models.

    If histology_codes is given, it is a list of OncoTree codes for the
    slide. If aeon_results is given, it is a DataFrame with the output of
    an Aeon run on the slide; its top three histologies are scored and
    histology_codes is ignored.

    model_path (a single model) takes precedence over model_map_path (a
    CSV mapping histologies/targets to models) when both are given.

    Returns a DataFrame with columns ["Cancer Subtype", "Biomarker",
    "Score"], one row per model that ran successfully. (The previous
    docstring incorrectly claimed the output was written to a file.)
    """
    if aeon_results is not None:
        aeon_scores = load_aeon_scores(aeon_results)
        target_histologies = select_histologies(aeon_scores)
    else:
        # Guard against None so the model-map loop below iterates safely
        # instead of raising TypeError.
        target_histologies = histology_codes or []

    # Build a dataset to feed to the model
    site = SiteType.METASTASIS if metastatic else SiteType.PRIMARY
    dataset = TileFeatureTensorDataset(
        tile_features=features,
        site_type=site,
        n_max_tiles=20000,
    )
    device = torch.device(
        "cuda" if not use_cpu and torch.cuda.is_available() else "cpu"
    )

    results = []
    if model_path:
        # Single-model mode: histology/target are unknown, recorded as "None".
        histology, target = "None", "None"
        try:
            score = run_model(device, dataset, model_path, num_workers, batch_size)
            results.append((histology, target, score))
            logger.info(f"histology: {histology} target: {target} score: {score}")
        except Exception as exc:
            logger.error(f"Unable to run model for {histology} target {target}\n{exc}")
    elif model_map_path:
        model_map = load_model_map(model_map_path)
        for histology in target_histologies:
            if histology not in model_map:
                logger.warning(f"Warning: no models found for {histology}")
                continue
            # Deterministic target order; a failing model is logged and
            # skipped rather than aborting the remaining targets.
            for target, model in sorted(model_map[histology].items()):
                try:
                    score = run_model(device, dataset, model, num_workers, batch_size)
                    results.append((histology, target, score))
                    logger.info(
                        f"histology: {histology} target: {target} score: {score}"
                    )
                except Exception as exc:
                    logger.error(
                        f"Unable to run model for {histology} target {target}\n{exc}"
                    )
    df = pd.DataFrame(results, columns=["Cancer Subtype", "Biomarker", "Score"])
    return df
def parse_args():
    """Parse and validate command-line arguments.

    Raises UsageError when both --histology-codes and
    --aeon-predictions-path are given, or when neither --model-path nor
    --model-map-path is given.

    Fixes: the mutual-exclusion error message referred to a nonexistent
    --codes flag, and the --model-map-path help text named a 'histology'
    column where load_model_map actually reads 'cancer_subtype'.
    """
    parser = ArgumentParser(description="Run Paladin inference on a single slide")
    parser.add_argument(
        "-i",
        "--features-path",
        required=True,
        help="Pathname to a .pt file with optimus embeddings for this slide",
    )
    parser.add_argument(
        "-o",
        "--output-path",
        help="The filename for the Paladin predictions file (CSV)",
        required=True,
    )
    parser.add_argument(
        "-c",
        "--histology-codes",
        help="One or more histologies (OncoTree codes, comma-separated)",
    )
    parser.add_argument(
        "-a",
        "--aeon-predictions-path",
        help="Pathname to an aeon-predictions file (CSV) for this slide",
    )
    parser.add_argument(
        "-mm",
        "--model-map-path",
        help="A CSV file mapping histologies and targets to Paladin models (.pkl files). Contains columns 'cancer_subtype', 'target_name', and 'model_path'.",
    )
    parser.add_argument(
        "-m",
        "--model-path",
        help="The filename for a Paladin model to run inference with",
    )
    parser.add_argument(
        "--metastatic", action="store_true", help="Tissue is from a metastatic site"
    )
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Batch size")
    parser.add_argument(
        "--num-workers",
        type=int,
        default=NUM_WORKERS,
        help="Number of workers for data loading",
    )
    parser.add_argument("--use-cpu", action="store_true", help="Use CPU")
    opt = parser.parse_args()
    if opt.histology_codes and opt.aeon_predictions_path:
        raise UsageError(
            "You may specify either --histology-codes or --aeon-predictions-path, but not both."
        )
    if opt.histology_codes:
        # Comma-separated string -> list of OncoTree codes.
        opt.histology_codes = opt.histology_codes.split(",")
    if opt.model_path is None and opt.model_map_path is None:
        raise UsageError("You must specify either --model-path or --model-map-path")
    return opt
def main():
    """CLI entry point: load inputs, run Paladin inference, write CSV output."""
    opt = parse_args()

    features = torch.load(opt.features_path)
    logger.info(f"Loaded features from {opt.features_path}")

    aeon_results = None
    if opt.aeon_predictions_path:
        aeon_results = pd.read_csv(opt.aeon_predictions_path)
        logger.info(f"Loaded Aeon results from {opt.aeon_predictions_path}")

    predictions = run_paladin(
        features=features,
        aeon_results=aeon_results,
        histology_codes=opt.histology_codes,
        model_map_path=opt.model_map_path,
        model_path=opt.model_path,
        metastatic=opt.metastatic,
        batch_size=opt.batch_size,
        num_workers=opt.num_workers,
        use_cpu=opt.use_cpu,
    )

    predictions.to_csv(opt.output_path, index=False)
    logger.info(f"Wrote {opt.output_path}")
| if __name__ == "__main__": | |
| main() | |