# py/extractive.py
# Extractive summarizer module adapted for deployment (TextRank + TF-IDF)
import re
import os
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

try:
    import py_vncorenlp
    _VNCORENLP_AVAILABLE = True
except Exception:
    _VNCORENLP_AVAILABLE = False

# Optional path for stopwords; if absent we'll use None
DEFAULT_STOPWORDS_PATH = os.path.join("data", "Vietnamese-stopwords.txt")


def load_stopwords(path=None):
    p = path or DEFAULT_STOPWORDS_PATH
    if os.path.exists(p):
        try:
            with open(p, "r", encoding="utf-8") as f:
                return [line.strip().lower() for line in f if line.strip()]
        except Exception:
            return None
    return None


def clean_text_basic(text):
    if not text:
        return ""
    # Remove caption/credit lines such as "Ảnh: ..." (photo), "Video: ...", "Nguồn: ..." (source)
    text = re.sub(r'^.*(Ảnh|Video|Clip|Nguồn)\s*:[^(\n)]*.*$', '', text, flags=re.MULTILINE | re.IGNORECASE)
    text = re.sub(r'\(?\[?(Ảnh|Video|Clip)\s*:[^)]+[\)\]]?', '', text, flags=re.IGNORECASE)
    # Strip a trailing run of all-caps text (e.g. an author byline), if present
    text = re.sub(r'\n\s*[A-ZÁÀẢÃẠÂĂÊÔƠƯĐ\s\.]{2,50}$', '', text.strip(), flags=re.MULTILINE)
    # Collapse runs of spaces and tabs
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()
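
# Illustrative behaviour of clean_text_basic (hypothetical input, shown for clarity):
#   clean_text_basic("Đại biểu tham dự hội nghị.\nẢnh: TTXVN\nHội nghị diễn ra tại Hà Nội.")
#   drops the standalone "Ảnh: TTXVN" (photo credit) line and keeps the body text.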


def split_sentences_fallback(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if len(s.strip()) > 15]


class ExtractiveSummarizer:
    def __init__(self, stopwords_path=None):
        self.stopwords = load_stopwords(stopwords_path)

        # If py_vncorenlp is importable, try to initialize the segmenter. The VnCoreNLP model
        # must already be available locally (typically fetched beforehand with
        # py_vncorenlp.download_model); if initialization fails, we fall back to regex splitting.
        self.rdrsegmenter = None
        if _VNCORENLP_AVAILABLE:
            try:
                self.rdrsegmenter = py_vncorenlp.VnCoreNLP()
            except Exception:
                self.rdrsegmenter = None

    def process_text_vncorenlp(self, text):
        # Returns a list of {'raw': plain sentence, 'segmented': word-segmented sentence} dicts.
        text = clean_text_basic(text)
        paragraphs = [p.strip() for p in text.replace('\r\n', '\n').replace('\r', '\n').split('\n') if p.strip()]
        processed_sentences = []
        if self.rdrsegmenter:
            try:
                for p in paragraphs:
                    sentences = self.rdrsegmenter.word_segment(p)
                    for s in sentences:
                        raw_sent = s.replace("_", " ")
                        if len(raw_sent) > 15 and any(c.isalpha() for c in raw_sent):
                            processed_sentences.append({'raw': raw_sent, 'segmented': s})
            except Exception:
                processed_sentences = []

        if not processed_sentences:
            # Fall back to the simple regex-based sentence splitter
            raw_sents = split_sentences_fallback(text)
            processed_sentences = [{'raw': s, 'segmented': s} for s in raw_sents]

        return processed_sentences

    def summarize(self, text, top_n=3):
        if not text or not text.strip():
            return ""
        sent_data = self.process_text_vncorenlp(text)
        if not sent_data:
            return ""
        if len(sent_data) <= top_n:
            # Text already has at most top_n sentences: return all of them (cleaned)
            return " ".join([s['raw'] for s in sent_data])

        corpus = [item['segmented'] for item in sent_data]
        try:
            vectorizer = TfidfVectorizer(stop_words=self.stopwords)
            tfidf_matrix = vectorizer.fit_transform(corpus)
        except Exception:
            # Degenerate corpus (e.g. empty vocabulary after stopword removal): return the first sentence
            return sent_data[0]['raw']

        similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
        nx_graph = nx.from_numpy_array(similarity_matrix)
        try:
            scores = nx.pagerank(nx_graph, max_iter=500)
        except Exception:
            # PageRank failed to converge: treat all sentences as equally important
            scores = {i: 1 for i in range(len(sent_data))}

        # Rank sentences by score, keep the top_n highest, then restore document order
        ranked_sentences = sorted(((scores[i], sent_data[i]['raw'], i)
                                   for i in range(len(sent_data))), reverse=True)
        selected_sentences = ranked_sentences[:top_n]
        selected_sentences.sort(key=lambda x: x[2])
        summary = " ".join([s for _, s, _ in selected_sentences])
        return summary
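

# --- Usage sketch (illustrative; runs only when this file is executed directly) ---
# The sample paragraph and top_n value below are hypothetical and only demonstrate
# the intended call pattern of ExtractiveSummarizer.
if __name__ == "__main__":
    # A stopwords file can be supplied explicitly, e.g.
    # ExtractiveSummarizer(stopwords_path="data/Vietnamese-stopwords.txt")
    summarizer = ExtractiveSummarizer()
    # Short hypothetical Vietnamese paragraph about Hanoi
    sample = (
        "Hà Nội là thủ đô của Việt Nam. "
        "Thành phố có lịch sử hơn một nghìn năm với nhiều di tích nổi tiếng. "
        "Đây là trung tâm chính trị, văn hóa và giáo dục của cả nước. "
        "Hà Nội cũng là một điểm đến du lịch được nhiều du khách yêu thích."
    )
    print(summarizer.summarize(sample, top_n=2))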