# py/extractive.py
# Extractive summarizer module adapted for deployment (TextRank + TF-IDF)
import re
import os
import random
import networkx as nx
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
try:
    import py_vncorenlp
    _VNCORENLP_AVAILABLE = True
except Exception:
    _VNCORENLP_AVAILABLE = False
# Optional path for stopwords; if absent we'll use None
DEFAULT_STOPWORDS_PATH = os.path.join("data", "Vietnamese-stopwords.txt")
def load_stopwords(path=None):
    p = path or DEFAULT_STOPWORDS_PATH
    if os.path.exists(p):
        try:
            with open(p, "r", encoding="utf-8") as f:
                return [m.strip().lower() for m in f.readlines() if m.strip()]
        except Exception:
            return None
    return None
def clean_text_basic(text):
    if not text:
        return ""
    # Remove caption lines like "Ảnh: ..." "Video: ..."
    text = re.sub(r'^.*(Ảnh|Video|Clip|Nguồn)\s*:[^(\n)]*.*$', '', text, flags=re.MULTILINE | re.IGNORECASE)
    text = re.sub(r'\(?\[?(Ảnh|Video|Clip)\s*:[^)]+[\)\]]?', '', text, flags=re.IGNORECASE)
    # Drop short all-uppercase fragments at line ends (typically bylines or credits)
    text = re.sub(r'\n\s*[A-ZÁÀẢÃẠÂĂÊÔƠƯĐ\s\.]{2,50}$', '', text.strip(), flags=re.MULTILINE)
    # Collapse runs of spaces and tabs
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()
def split_sentences_fallback(text):
    # Naive regex sentence splitter, used when VnCoreNLP is unavailable
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if len(s.strip()) > 15]
class ExtractiveSummarizer:
    def __init__(self, stopwords_path=None):
        self.stopwords = load_stopwords(stopwords_path)
        # If py_vncorenlp is available, try to initialize it (may require a prior model download)
        self.rdrsegmenter = None
        if _VNCORENLP_AVAILABLE:
            try:
                # Use the default model dir if present in the environment; otherwise py_vncorenlp will try to download it
                self.rdrsegmenter = py_vncorenlp.VnCoreNLP()
            except Exception:
                self.rdrsegmenter = None
    def process_text_vncorenlp(self, text):
        text = clean_text_basic(text)
        paragraphs = [p.strip() for p in text.replace('\r\n', '\n').replace('\r', '\n').split('\n') if p.strip()]
        processed_sentences = []
        if self.rdrsegmenter:
            try:
                for p in paragraphs:
                    sentences = self.rdrsegmenter.word_segment(p)
                    for s in sentences:
                        # VnCoreNLP joins word tokens with "_"; undo that for the raw text
                        raw_sent = s.replace("_", " ")
                        if len(raw_sent) > 15 and any(c.isalpha() for c in raw_sent):
                            processed_sentences.append({'raw': raw_sent, 'segmented': s})
            except Exception:
                processed_sentences = []
        if not processed_sentences:
            # Fall back to regex sentence splitting when VnCoreNLP is missing or failed
            raw_sents = split_sentences_fallback(text)
            processed_sentences = [{'raw': s, 'segmented': s} for s in raw_sents]
        return processed_sentences
    def summarize(self, text, top_n=3):
        if not text or not text.strip():
            return ""
        sent_data = self.process_text_vncorenlp(text)
        if not sent_data:
            return ""
        if len(sent_data) <= top_n:
            # Text is too short to rank; return the cleaned sentences as-is
            return " ".join([s['raw'] for s in sent_data])
        corpus = [item['segmented'] for item in sent_data]
        try:
            # TF-IDF vectors over the segmented sentences
            vectorizer = TfidfVectorizer(stop_words=self.stopwords)
            tfidf_matrix = vectorizer.fit_transform(corpus)
        except Exception:
            return sent_data[0]['raw'] if sent_data else text
        # Build a sentence-similarity graph and rank sentences with PageRank (TextRank)
        similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
        nx_graph = nx.from_numpy_array(similarity_matrix)
        try:
            scores = nx.pagerank(nx_graph, max_iter=500)
        except Exception:
            # If PageRank fails to converge, fall back to uniform scores
            scores = {i: 1 for i in range(len(sent_data))}
        ranked_sentences = sorted(((scores[i], sent_data[i]['raw'], i)
                                   for i in range(len(sent_data))), reverse=True)
        # Keep the top_n sentences, then restore their original order in the text
        selected_sentences = ranked_sentences[:top_n]
        selected_sentences.sort(key=lambda x: x[2])
        summary = " ".join([s for _, s, _ in selected_sentences])
        return summary
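
# Minimal usage sketch (illustrative only; the sample text below is placeholder
# data and not part of the deployed app). Without VnCoreNLP or a stopwords file,
# the regex fallback splitter and stop_words=None are used.
if __name__ == "__main__":
    sample_text = (
        "The first sentence introduces the main topic of the article. "
        "The second sentence adds an important supporting detail. "
        "The third sentence is only loosely related background. "
        "The fourth sentence repeats the main topic with concrete figures."
    )
    summarizer = ExtractiveSummarizer()
    print(summarizer.summarize(sample_text, top_n=2))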