raahinaez committed
Commit 3486df4 · verified · 1 Parent(s): cfb98af

Update app.py

Files changed (1)
app.py +43 -12
app.py CHANGED
@@ -1,16 +1,25 @@
 import os
 import gradio as gr
 import pdfplumber
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
+from transformers import AutoTokenizer, AutoModel

+# -------------------------
+# Settings
+# -------------------------
 model_name = "ibm-granite/granite-docling-258M"
-
-# Use HF token stored in Space secrets
-hf_token = os.environ.get("HF_HUB_TOKEN")
+hf_token = os.environ.get("HF_HUB_TOKEN")  # Set this in Space secrets

+# -------------------------
+# Load tokenizer and model
+# -------------------------
 tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=hf_token)
+model = AutoModel.from_pretrained(model_name, token=hf_token)
+model.eval()  # inference mode

+# -------------------------
+# PDF → Text
+# -------------------------
 def extract_text_from_pdf(pdf_file):
     text = ""
     with pdfplumber.open(pdf_file.name) as pdf:
@@ -20,19 +29,41 @@ def extract_text_from_pdf(pdf_file):
                 text += page_text + "\n"
     return text

+# -------------------------
+# Text → JSON using Granite
+# -------------------------
 def pdf_to_json(pdf_file):
     text = extract_text_from_pdf(pdf_file)
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=2048)
-    outputs = model.generate(**inputs, max_new_tokens=1024)
-    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return result
+
+    # Chunk text if too long
+    max_length = 2048
+    chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+
+    results = []
+    for chunk in chunks:
+        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_length)
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # Decode tokens back to string
+        decoded = tokenizer.batch_decode(outputs.last_hidden_state.argmax(-1), skip_special_tokens=True)
+        results.append("".join(decoded))
+
+    # Combine all chunks
+    combined_result = "\n".join(results)
+    return combined_result

+# -------------------------
+# Gradio Interface
+# -------------------------
 interface = gr.Interface(
     fn=pdf_to_json,
     inputs=gr.File(file_types=[".pdf"]),
     outputs=gr.Textbox(label="Generated JSON"),
-    title="PDF to JSON using Granite DocLing",
-    description="Upload a PDF and get a JSON output using ibm-granite/granite-docling-258M."
+    title="PDF to JSON with Granite DocLing",
+    description="Upload a PDF and get structured JSON output using ibm-granite/granite-docling-258M."
 )

-interface.launch()
+if __name__ == "__main__":
+    interface.launch()
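
For readers skimming the diff: the new pdf_to_json splits the extracted text into fixed-size character windows before tokenizing each piece. Below is a minimal sketch of that slicing step on a toy string, reusing the max_length value and list-comprehension pattern from app.py (the sample text is illustrative only). Note that the windows are measured in characters, while the tokenizer's max_length argument counts tokens, so each chunk may still be truncated further at tokenization time.

# Minimal sketch of the character-level chunking added in pdf_to_json.
# The slicing pattern and max_length mirror app.py; the input string is a
# stand-in for real extracted PDF text.
text = "x" * 5000
max_length = 2048
chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
print([len(c) for c in chunks])  # [2048, 2048, 904] - the last window holds the remainder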
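
Separately, because interface.launch() is now guarded by if __name__ == "__main__", app.py can be imported without starting the Gradio server (the model is still loaded at import time). Here is a hypothetical smoke test built on that, meant to run next to app.py with HF_HUB_TOKEN set; "sample.pdf" is a placeholder path, and SimpleNamespace stands in for the uploaded-file object, of which pdf_to_json only reads the .name attribute.

# Hypothetical smoke test for pdf_to_json outside the Gradio UI.
# Assumes this file sits next to app.py and HF_HUB_TOKEN is set in the environment.
from types import SimpleNamespace

from app import pdf_to_json  # safe to import now that launch() is under __main__

fake_upload = SimpleNamespace(name="sample.pdf")  # placeholder PDF path
print(pdf_to_json(fake_upload)[:500])  # preview the first 500 characters of the output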