rntc committed on
Commit
f12fd7f
·
verified ·
1 Parent(s): 6b6abc7

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +24 -4
  2. __pycache__/app.cpython-312.pyc +0 -0
  3. app.py +320 -0
  4. requirements.txt +3 -0
README.md CHANGED
@@ -1,12 +1,32 @@
1
  ---
2
  title: VizAnnot
3
- emoji: 🌖
4
  colorFrom: blue
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.0.1
8
  app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: VizAnnot
3
+ emoji: 🔬
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ datasets:
12
+ - rntc/biomed-fr-pancreas-annotations
13
  ---
14
 
15
+ # Pancreas Cancer Clinical Report Annotations Explorer
16
+
17
+ This Gradio app allows you to explore annotations extracted from synthetic French clinical reports about pancreas cancer.
18
+
19
+ ## Features
20
+
21
+ - **Highlighted text view**: Clinical reports with colored spans showing where information was extracted
22
+ - **Hover tooltips**: Hover over highlighted spans to see the variable name and extracted value
23
+ - **Annotations table**: Complete table of all extracted variables with values and source spans
24
+ - **Search**: Search through clinical reports by text content
25
+
26
+ ## Dataset
27
+
28
+ The app loads data from [rntc/biomed-fr-pancreas-annotations](https://huggingface.co/datasets/rntc/biomed-fr-pancreas-annotations).
29
+
30
+ ## Usage
31
+
32
+ Use the tabs to switch between datasets and the slider to navigate between samples.
__pycache__/app.cpython-312.pyc ADDED
Binary file (17 kB). View file
 
app.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio app to explore pancreas or lymphome clinical report annotations.
3
+ """
4
+
5
+ import os
6
+ from functools import partial
7
+ from pathlib import Path
8
+
9
+ import gradio as gr
10
+ from datasets import load_dataset
11
+
12
# Threshold passed to prepare_source(): samples with fewer real annotations
# are filtered out of the explorer.
MIN_ANNOTATIONS = 10

# Hub dataset repositories; each can be overridden through the environment.
PANCREAS_REPO_ID = os.getenv("PANCREAS_REPO_ID", "rntc/biomed-fr-pancreas-annotations")
LYMPHOME_REPO_ID = os.getenv("LYMPHOME_REPO_ID", "rntc/biomed-fr-lymphome-annotations")

# Local JSONL used as a fallback when the lymphome Hub dataset cannot be loaded.
# The default lives one directory above this file; like the repo ids it can be
# overridden through the environment (an empty value falls back to the default).
_DEFAULT_LYMPHOME_LOCAL_JSONL = (
    Path(__file__).resolve().parent.parent
    / "Qwen--Qwen3-235B-A22B-Instruct-2507-FP8-4-lymphome-annotation-20251201_153807.jsonl"
)
LYMPHOME_LOCAL_JSONL = Path(
    os.getenv("LYMPHOME_LOCAL_JSONL") or _DEFAULT_LYMPHOME_LOCAL_JSONL
)

# Colors for highlighting; assigned to variables in order of first appearance
# and reused cyclically when a report has more variables than colors.
COLORS = [
    "#FFEB3B",
    "#4CAF50",
    "#2196F3",
    "#FF9800",
    "#E91E63",
    "#9C27B0",
    "#00BCD4",
    "#8BC34A",
    "#FF5722",
    "#607D8B",
]
33
+
34
+
35
def count_real_annotations(annotation):
    """Count real annotations (excluding 'not found' placeholders).

    An entry counts when it is a non-empty dict with a truthy ``"value"``,
    unless its ``"span"`` contains "pas de mention" or its value contains
    "not performed" (both compared case-insensitively).

    Args:
        annotation: Mapping of variable name to a dict holding ``"value"``
            and ``"span"``. ``None`` or an empty mapping yields 0.

    Returns:
        Number of entries that carry real extracted information.
    """
    if not annotation:
        # dict.get("annotation", {}) returns None when the key exists but is
        # null, so callers can legitimately hand us None here.
        return 0

    count = 0
    for var_data in annotation.values():
        if not isinstance(var_data, dict):
            continue
        value = var_data.get("value")
        if not value:
            continue
        # Coerce to str so a null or non-string span cannot break .lower().
        span = str(var_data.get("span") or "")
        if "pas de mention" in span.lower():
            continue
        if "not performed" in str(value).lower():
            continue
        count += 1
    return count
49
+
50
+
51
def escape_html(text):
    """Escape text for safe use in HTML content and quoted attributes.

    Besides ``&``, ``<`` and ``>``, both quote characters are escaped: the
    result is interpolated into a ``title="..."`` attribute elsewhere in this
    file, where an unescaped quote would terminate the attribute early.

    Args:
        text: Any value; it is converted with ``str()``. Falsy values
            (``None``, ``""``, ``0``) produce an empty string.

    Returns:
        The escaped string, or ``""`` for falsy input.
    """
    # Function-scope import keeps the module's import block untouched.
    import html

    if not text:
        return ""
    return html.escape(str(text), quote=True)
55
+
56
+
57
def highlight_text(cr_text, annotation):
    """Render the report text as HTML with annotated spans wrapped in <mark>.

    For every annotation entry that has both a span and a value, the first
    occurrence of the span in ``cr_text`` is located. Candidates are ordered
    by position and any candidate starting before the previous kept one ends
    is dropped. Each kept span gets a background color per variable and a
    ``title`` tooltip of the form "Variable Name: value".

    Args:
        cr_text: The report text to display.
        annotation: Mapping of variable name to a dict with "span"/"value".

    Returns:
        A ``<pre>`` HTML fragment; plain escaped text when there is nothing
        to highlight.
    """

    def plain():
        return f"<pre style='white-space:pre-wrap;'>{escape_html(cr_text)}</pre>"

    if not (cr_text and annotation):
        return plain()

    # Candidate tuples: (start, end, label, value, matched text).
    candidates = []
    for var_name, var_data in annotation.items():
        if not (var_data and isinstance(var_data, dict)):
            continue
        matched = var_data.get("span")
        value = var_data.get("value")
        if matched and value and matched in cr_text:
            begin = cr_text.find(matched)
            candidates.append(
                (
                    begin,
                    begin + len(matched),
                    var_name.replace("_", " ").title(),
                    str(value),
                    matched,
                )
            )

    if not candidates:
        return plain()

    # Stable sort on the start offset only, then keep non-overlapping spans.
    kept = []
    for cand in sorted(candidates, key=lambda c: c[0]):
        if kept and cand[0] < kept[-1][1]:
            continue
        kept.append(cand)

    pieces = []
    cursor = 0
    palette = {}
    for begin, end, label, value, matched in kept:
        if begin > cursor:
            pieces.append(escape_html(cr_text[cursor:begin]))

        # A new label takes the next color; len(palette) is the running index.
        if label not in palette:
            palette[label] = COLORS[len(palette) % len(COLORS)]
        shade = palette[label]

        pieces.append(
            f'<mark style="background:{shade};padding:1px 3px;border-radius:3px;" '
            f'title="{escape_html(label)}: {escape_html(value)}">'
            f'{escape_html(matched)}</mark>'
        )
        cursor = end

    if cursor < len(cr_text):
        pieces.append(escape_html(cr_text[cursor:]))

    body = "".join(pieces)
    return f"<pre style='white-space:pre-wrap;line-height:1.6;'>{body}</pre>"
115
+
116
+
117
def format_table(annotation):
    """Render the annotations as a three-column HTML table.

    Columns are Variable, Value and Source. Entries without a value, and
    placeholder entries (span containing "pas de mention" or value containing
    "not performed"), show "/" as value and an empty source. Source spans
    longer than 60 characters are cut to 60 and suffixed with "...".

    Args:
        annotation: Mapping of variable name to a dict with "value"/"span".

    Returns:
        An HTML string; ``<p>No annotations</p>`` when ``annotation`` is falsy.
    """
    if not annotation:
        return "<p>No annotations</p>"

    body_rows = []
    for var_name, var_data in annotation.items():
        if not (var_data and isinstance(var_data, dict)):
            continue

        value = var_data.get("value")
        span = var_data.get("span", "")
        var_label = var_name.replace("_", " ").title()

        # Start from the "nothing to show" defaults and override for real data.
        shown_value, shown_span = "/", ""
        if value:
            is_placeholder = (
                span and "pas de mention" in span.lower()
            ) or "not performed" in str(value).lower()
            if not is_placeholder:
                shown_value = str(value)
                if span and len(span) > 60:
                    shown_span = span[:60] + "..."
                else:
                    shown_span = span or ""

        body_rows.append(
            "\n".join(
                [
                    "<tr>",
                    f'<td style="padding:6px 10px;border-bottom:1px solid #ddd;font-weight:500;">{escape_html(var_label)}</td>',
                    f'<td style="padding:6px 10px;border-bottom:1px solid #ddd;color:#1565C0;">{escape_html(shown_value)}</td>',
                    f'<td style="padding:6px 10px;border-bottom:1px solid #ddd;color:#666;font-size:12px;font-style:italic;">{escape_html(shown_span)}</td>',
                    "</tr>",
                ]
            )
        )

    return "\n".join(
        [
            '<table style="width:100%;border-collapse:collapse;font-size:13px;">',
            '<thead><tr style="background:#f5f5f5;">',
            '<th style="padding:8px 10px;text-align:left;border-bottom:2px solid #ddd;">Variable</th>',
            '<th style="padding:8px 10px;text-align:left;border-bottom:2px solid #ddd;">Value</th>',
            '<th style="padding:8px 10px;text-align:left;border-bottom:2px solid #ddd;">Source</th>',
            "</tr></thead>",
            f'<tbody>{"".join(body_rows)}</tbody>',
            "</table>",
        ]
    )
160
+
161
+
162
def load_pancreas_dataset():
    """Load the "train" split of the pancreas dataset named by PANCREAS_REPO_ID.

    Progress is reported on stdout before and after loading.

    Returns:
        The dataset object returned by ``load_dataset``.
    """
    print(f"Loading pancreas dataset from {PANCREAS_REPO_ID}...")
    pancreas = load_dataset(PANCREAS_REPO_ID, split="train")
    n_samples = len(pancreas)
    print(f"Loaded {n_samples} pancreas samples")
    return pancreas
167
+
168
+
169
def load_lymphome_dataset():
    """Load the lymphome dataset, preferring the Hub over the local JSONL.

    The "train" split of LYMPHOME_REPO_ID is tried first. If that fails for
    any reason and LYMPHOME_LOCAL_JSONL exists, the local file is loaded with
    the "json" builder instead.

    Returns:
        The dataset object returned by ``load_dataset``.

    Raises:
        Exception: The original Hub failure is re-raised when no local
            fallback file exists.
    """
    print(f"Loading lymphome dataset from {LYMPHOME_REPO_ID} (Hub)...")
    try:
        hub_data = load_dataset(LYMPHOME_REPO_ID, split="train")
        print(f"Loaded {len(hub_data)} lymphome samples from Hub")
        return hub_data
    except Exception as exc:  # noqa: BLE001 (we want to surface any failure)
        print(f"Failed to load lymphome dataset from Hub: {exc}")
        if not LYMPHOME_LOCAL_JSONL.exists():
            # No fallback available: propagate the Hub error unchanged.
            raise
        print(f"Falling back to local lymphome JSONL at {LYMPHOME_LOCAL_JSONL}")
        local_data = load_dataset(
            "json",
            data_files=str(LYMPHOME_LOCAL_JSONL),
            split="train",
        )
        print(f"Loaded {len(local_data)} lymphome samples from local file")
        return local_data
183
+
184
+
185
def filter_indices(dataset, min_annotations):
    """Return the positions of samples with enough real annotations.

    Args:
        dataset: Iterable of samples exposing ``.get("annotation", {})``.
        min_annotations: Minimum count (inclusive) from
            ``count_real_annotations`` for a sample to be kept.

    Returns:
        List of integer positions into ``dataset``, in ascending order.
    """
    kept = []
    for position, sample in enumerate(dataset):
        n_real = count_real_annotations(sample.get("annotation", {}))
        if n_real >= min_annotations:
            kept.append(position)
    return kept
191
+
192
+
193
def prepare_source(key, label, loader, min_annotations):
    """Load one dataset source and precompute its filtered sample indices.

    Any exception raised while loading or filtering is caught, printed and
    recorded in the result instead of propagating.

    Args:
        key: Source identifier (not used by this function).
        label: Human-readable name used in log messages and stored as "label".
        loader: Zero-argument callable returning the dataset.
        min_annotations: Threshold forwarded to ``filter_indices``.

    Returns:
        Dict with keys "label", "dataset", "filtered_indices",
        "min_annotations" and "error". On failure "dataset" is None,
        "filtered_indices" is empty and "error" holds ``str(exc)``.
    """
    # Begin with the failure-shaped record; fill in the data on success.
    state = {
        "label": label,
        "dataset": None,
        "filtered_indices": [],
        "min_annotations": min_annotations,
        "error": None,
    }
    try:
        loaded = loader()
        kept = filter_indices(loaded, min_annotations)
        print(f"{label}: filtered to {len(kept)} samples with >= {min_annotations} annotations")
    except Exception as exc:  # noqa: BLE001 (we want to surface any failure)
        print(f"Failed to load {label}: {exc}")
        state["error"] = str(exc)
        return state

    state["dataset"] = loaded
    state["filtered_indices"] = kept
    return state
215
+
216
+
217
# Registry of dataset sources, keyed by source id. Built at import time, so
# both datasets are loaded (or their load errors recorded) before the UI exists.
SOURCES = {
    source_key: prepare_source(source_key, source_label, source_loader, MIN_ANNOTATIONS)
    for source_key, source_label, source_loader in (
        ("pancreas", "Pancréas", load_pancreas_dataset),
        ("lymphome", "Lymphome", load_lymphome_dataset),
    )
}
221
+
222
+
223
def display_sample_for_source(source_key, slider_idx):
    """Build the three HTML panels for one sample of a dataset source.

    Args:
        source_key: Key into ``SOURCES``.
        slider_idx: Position within the source's filtered indices; converted
            with ``int()``.

    Returns:
        Tuple ``(original_html, cr_html, table_html)``. When the source failed
        to load, has no qualifying samples, or the index is out of range, the
        same message string is returned in all three positions.
    """
    source = SOURCES[source_key]

    failure = None
    if source["error"]:
        failure = f"Dataset unavailable: {source['error']}"
    elif not source["filtered_indices"]:
        failure = f"No samples with >= {source['min_annotations']} annotations."
    if failure is not None:
        return failure, failure, failure

    kept = source["filtered_indices"]
    position = int(slider_idx)
    if not 0 <= position < len(kept):
        return "Invalid", "Invalid", "Invalid"

    # Translate the slider position back to the row index in the full dataset.
    real_idx = kept[position]
    sample = source["dataset"][real_idx]

    original = sample.get("original_text", "")
    report = sample.get("CR", "")
    annotation = sample.get("annotation", {})

    n_annotations = count_real_annotations(annotation)

    original_html = (
        f"<pre style='white-space:pre-wrap;line-height:1.6;'>{escape_html(original)}</pre>"
    )
    header = f"<p><b>Sample #{real_idx}</b> — {n_annotations} annotations</p>"
    cr_html = header + highlight_text(report, annotation)
    table_html = format_table(annotation)

    return original_html, cr_html, table_html
255
+
256
+
257
def build_tab(source_key):
    """Create the Gradio tab for one dataset source.

    The tab shows a warning when the source failed to load or has no
    qualifying samples. Otherwise it contains a sample slider and three HTML
    panels (original text, generated report, extracted variables), refreshed
    on slider change and on page load.

    Relies on the module-level ``demo`` Blocks object being bound, so it must
    be called inside the ``gr.Blocks`` context that defines it.

    Args:
        source_key: Key into ``SOURCES``.
    """
    source = SOURCES[source_key]
    label = source["label"]
    kept = source["filtered_indices"]
    threshold = source["min_annotations"]

    with gr.TabItem(label):
        if source["error"]:
            gr.Markdown(f"⚠️ Could not load {label} dataset: {escape_html(source['error'])}")
            return

        if not kept:
            gr.Markdown(f"⚠️ No samples with >= {threshold} annotations.")
            return

        gr.Markdown(
            f"Showing {len(kept)} samples with >= "
            f"{threshold} annotations. Hover over highlights to see values."
        )

        with gr.Row():
            slider = gr.Slider(0, len(kept) - 1, value=0, step=1, label="Sample")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original (English)")
                original_html = gr.HTML()
            with gr.Column():
                gr.Markdown("### Generated CR (French)")
                cr_html = gr.HTML()
            with gr.Column():
                gr.Markdown("### Extracted Variables")
                table_html = gr.HTML()

        # One bound renderer serves both the slider event and the page load.
        render = partial(display_sample_for_source, source_key)
        slider.change(
            fn=render,
            inputs=[slider],
            outputs=[original_html, cr_html, table_html],
        )
        demo.load(
            fn=render,
            inputs=[slider],
            outputs=[original_html, cr_html, table_html],
        )
305
+
306
+
307
+ # Build UI
308
with gr.Blocks(title="Clinical Annotations Explorer", theme=gr.themes.Base()) as demo:
    # Page header and a short usage hint.
    gr.Markdown("# 🔬 Clinical Annotation Explorer")
    intro = (
        "Use the tabs below to switch between pancreas and lymphome annotations. "
        "Hover over highlights to see the extracted values."
    )
    gr.Markdown(intro)

    # One tab per dataset source, in a fixed order.
    with gr.Tabs():
        for tab_key in ("pancreas", "lymphome"):
            build_tab(tab_key)

# Start the server only when executed as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==4.44.0
2
+ datasets
3
+ huggingface_hub<0.27