import gradio as gr
import PyPDF2
import io
from transformers import pipeline, AutoTokenizer
import torch
import re
from typing import List, Tuple
import warnings

warnings.filterwarnings("ignore")


class PDFSummarizer:
    def __init__(self):
        # Use a much faster, lighter model for summarization
        self.model_name = "sshleifer/distilbart-cnn-12-6"  # Much faster than BART-large
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        try:
            # Initialize the summarization pipeline with optimizations
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
            )
            # Initialize tokenizer for length calculations
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Fall back to the standard BART model on CPU
            self.model_name = "facebook/bart-large-cnn"
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text content from a PDF file given as raw bytes"""
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            text = ""
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if page_text.strip():
                    text += f"\n--- Page {page_num + 1} ---\n"
                    text += page_text
            return text.strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def clean_text(self, text: str) -> str:
        """Clean and preprocess text"""
        # Collapse extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
        # Remove page markers
        text = re.sub(r'--- Page \d+ ---', '', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
        """Split text into smaller, more manageable chunks for faster processing"""
        sentences = text.split('. ')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Check if adding this sentence would exceed the limit
            potential_chunk = current_chunk + sentence + ". "
            # Use word count as a fast length estimate
            if len(potential_chunk.split()) <= max_chunk_length:
                current_chunk = potential_chunk
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "
" if current_chunk: chunks.append(current_chunk.strip()) # Limit number of chunks for speed return chunks[:5] # Process max 5 chunks for speed def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str: """Summarize a single chunk of text with speed optimizations""" try: # Speed optimizations summary = self.summarizer( chunk, max_length=max_length, min_length=min_length, do_sample=False, truncation=True, early_stopping=True, num_beams=2 # Reduced from default 4 for speed ) return summary[0]['summary_text'] except Exception as e: return f"Error summarizing chunk: {str(e)}" def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]: """Main function to process PDF and generate summary""" try: # Extract text from PDF raw_text = self.extract_text_from_pdf(pdf_file) if not raw_text.strip(): return "❌ Error: No text could be extracted from the PDF.", "", "" # Clean the text cleaned_text = self.clean_text(raw_text) # Calculate text statistics word_count = len(cleaned_text.split()) char_count = len(cleaned_text) if word_count < 50: return "❌ Error: PDF contains too little text to summarize.", "", "" # Chunk the text for processing chunks = self.chunk_text(cleaned_text) # Determine summary parameters based on type (optimized for speed) if summary_type == "Brief (Quick)": max_len, min_len = 60, 20 elif summary_type == "Detailed": max_len, min_len = 100, 40 else: # Comprehensive max_len, min_len = 150, 60 # Summarize each chunk (with progress tracking) chunk_summaries = [] for i, chunk in enumerate(chunks): print(f"Processing chunk {i+1}/{len(chunks)}") summary = self.summarize_chunk(chunk, max_len, min_len) chunk_summaries.append(summary) # Combine summaries combined_summary = " ".join(chunk_summaries) # Skip final summarization for speed if we have few chunks if len(chunks) <= 2: final_summary = combined_summary else: # Quick final summary for multiple chunks final_summary = self.summarize_chunk( combined_summary, max_length=min(200, max_len * 1.5), min_length=min_len ) # Create statistics summary_stats = f""" 📊 **Document Statistics:** - Original word count: {word_count:,} - Original character count: {char_count:,} - Pages processed: {len(chunks)} - Summary word count: {len(final_summary.split()):,} - Compression ratio: {word_count / len(final_summary.split()):.1f}:1 """ return final_summary, summary_stats, "✅ Summary generated successfully!" except Exception as e: return f"❌ Error processing PDF: {str(e)}", "", "" # Initialize the summarizer pdf_summarizer = PDFSummarizer() def summarize_pdf_interface(pdf_file, summary_type): """Gradio interface function""" if pdf_file is None: return "❌ Please upload a PDF file.", "", "" try: # Read the uploaded file - pdf_file is already the file path with open(pdf_file, 'rb') as f: pdf_content = f.read() # Process the PDF summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type) return summary, stats, status except Exception as e: return f"❌ Error: {str(e)}", "", "" # Create Gradio interface def create_interface(): with gr.Blocks( title="📄 AI PDF Summarizer", theme=gr.themes.Soft(), css=""" .gradio-container { max-width: 1200px !important; } .summary-box { border-left: 4px solid #2196F3; padding: 16px; background-color: #f8f9fa; } """ ) as interface: gr.Markdown(""" # 📄 AI-Powered PDF Summarizer Upload any PDF document and get an intelligent summary in seconds! Perfect for research papers, reports, articles, and books. 
        **Features:**
        - ⚡ Fast processing with a distilled BART model
        - 📊 Document statistics
        - 🎯 Multiple summary lengths
        - 🔍 Smart text chunking
        """)

        with gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="📁 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath"
                )

                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="📏 Summary Length",
                    info="Choose how detailed you want the summary to be"
                )

                summarize_btn = gr.Button(
                    "🚀 Generate Summary",
                    variant="primary",
                    size="lg"
                )

                status_output = gr.Textbox(
                    label="📋 Status",
                    interactive=False,
                    max_lines=2
                )

            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="📝 Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"]
                )

                stats_output = gr.Markdown(
                    label="📊 Document Statistics",
                    value="Upload a PDF to see statistics"
                )

        # Tips and technical details
        gr.Markdown("""
        ## 💡 Tips for Best Results:
        - **File Quality**: Ensure your PDF has selectable text (not just images)
        - **Length**: Works best with documents between 500 and 10,000 words
        - **Language**: Optimized for English content
        - **Format**: Clean, well-formatted PDFs produce better summaries

        ## 🔧 Technical Details:
        - **Model**: DistilBART-CNN-12-6 (with BART-Large-CNN as a fallback)
        - **Processing**: Sentence-aware text chunking without overlap
        - **Speed**: GPU-accelerated when available
        """)

        # Connect the button to the summarization function
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

        # Auto-process when a file is uploaded
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

    return interface


# Launch the application
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
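

# Optional non-UI usage: a minimal sketch of how the summarizer could be exercised
# directly for a quick local smoke test. "sample.pdf" is a hypothetical placeholder
# path, not a file that ships with this script; point it at any local, text-based
# PDF and uncomment to run:
#
#     with open("sample.pdf", "rb") as f:
#         summary, stats, status = pdf_summarizer.process_pdf(f.read(), "Brief (Quick)")
#     print(status)
#     print(summary)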