mastefan committed
Commit 8fc00ee · verified · 1 Parent(s): 0474d38

Update src/app/conversation_core.py

Files changed (1)
  1. src/app/conversation_core.py +122 -216
src/app/conversation_core.py CHANGED
@@ -1,53 +1,43 @@
 
 ###############################################################
-# conversation_core.py — Agentic Partner Core (Qwen 1.5B + Whisper)
 ###############################################################
 
 import io
 import re
-import tempfile
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
-from .config import get_user_dir
 
 import numpy as np
-from transformers import pipeline
 from pydub import AudioSegment
-
 import torch
 from gtts import gTTS
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
 )
 
-###############################################################
-# MODEL / LANGUAGE CONSTANTS
-###############################################################
 
-QWEN_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
-WHISPER_MODEL_SIZE = "base"  # you can change to "large-v3" if GPU budget allows
 
-_QWEN_TOKENIZER = None
-_QWEN_MODEL = None
-_WHISPER = None
 
-# Spoken language -> Whisper hint
-WHISPER_LANG_MAP = {
-    "english": "en",
-    "german": "de",
-    "spanish": "es",
-    "russian": "ru",
-    "french": "fr",
-    "italian": "it",
-    "japanese": "ja",
-    "chinese": "zh",
-    "korean": "ko",
-    "arabic": "ar",
-    "hindi": "hi",
 }
 
-# Spoken language -> gTTS language code
 GTTS_LANG = {
     "english": "en",
     "spanish": "es",
@@ -60,26 +50,24 @@ GTTS_LANG = {
     "italian": "it",
 }
 
-CONTROL_PROMPTS = {
-    "A1": "Use extremely short, simple sentences and very basic vocabulary.",
-    "A2": "Use simple sentences and common everyday vocabulary.",
-    "B1": "Use moderately complex sentences and conversational vocabulary.",
-    "B2": "Use natural, fluent sentences with richer vocabulary.",
-    "C1": "Use complex, advanced sentences with nuanced expressions.",
-    "C2": "Use highly sophisticated, near-native language and style.",
-}
 
 
-###############################################################
-# GLOBAL LOADERS
-###############################################################
 
-def load_partner_lm() -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
     global _QWEN_TOKENIZER, _QWEN_MODEL
-    if _QWEN_TOKENIZER is not None and _QWEN_MODEL is not None:
         return _QWEN_TOKENIZER, _QWEN_MODEL
 
-    print("[conversation_core] Loading partner LM:", QWEN_MODEL_NAME)
     tok = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         QWEN_MODEL_NAME,
@@ -87,58 +75,30 @@ def load_partner_lm() -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
         device_map="auto",
         trust_remote_code=True,
     )
     _QWEN_TOKENIZER = tok
     _QWEN_MODEL = model
     return tok, model
 
 
-# ---------------------------------
-# OPENAI WHISPER (CPU)
-# ---------------------------------
-_whisper_model = None
 
-_LANG_HINTS = {
-    "english": "en",
-    "spanish": "es",
-    "german": "de",
-    "russian": "ru",
-    "japanese": "ja",
-    "chinese": "zh",
-    "korean": "ko",
-}
-
-
-##########################################
-# SPEECH RECOGNITION — faster-whisper
-##########################################
-
-def transcribe_audio(audio_segment, spoken_lang=None):
-    """
-    Accepts a pydub AudioSegment (mono, 16k).
-    Returns transcript, detected_language, confidence.
-    """
-    global _whisper_model
-    if _whisper_model is None:
-        load_whisper()
-
-    import numpy as np
-
-    audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
-
-    segments, info = _whisper_model.transcribe(
-        audio,
-        beam_size=5,
-        language=spoken_lang,
     )
-
-    full_text = " ".join([s.text.strip() for s in segments])
-
-    return full_text.strip(), info.language, info.language_probability
 
 
-###############################################################
 # DATA STRUCTURE
-###############################################################
 
 @dataclass
 class ConversationTurn:
@@ -146,104 +106,90 @@ class ConversationTurn:
     text: str
 
 
-###############################################################
-# CLEANING
-###############################################################
 
 def clean_assistant_reply(text: str) -> str:
-    """Strip meta, identity, and obvious junk from LM output."""
     if not text:
         return ""
 
-    # Remove labels
-    text = re.sub(r"(?i)\b(user|assistant|system)\s*:\s*", "", text)
 
-    # Remove numbered / bullet lists (not wanted in casual chat)
     text = re.sub(r"(?m)^\s*[-•*]\s+.*$", "", text)
     text = re.sub(r"(?m)^\s*\d+\.\s+.*$", "", text)
 
-    # Remove obvious identity / HR / meta nonsense
     identity_patterns = [
-        r"(?i)i am (an?|the)? ?(ai|assistant|speaker|model|natural person).*",
-        r"(?i)my name is [A-Za-zäöüÄÖÜß]+.*",
-        r"(?i)i was created.*",
-        r"(?i)human resources manager.*",
-        r"(?i)job description.*",
         r"(?i)i am a large language model.*",
     ]
-    for pat in identity_patterns:
-        text = re.sub(pat, "", text)
-
-    # Trim hanging word fragments at the end
-    text = re.sub(r"[A-Za-zÄÖÜäöüß]+$", "", text)
 
-    # Collapse whitespace
     text = re.sub(r"\s{2,}", " ", text)
     return text.strip()
 
 
-###############################################################
 # CONVERSATION MANAGER
-###############################################################
 
 class ConversationManager:
     def __init__(
         self,
-        target_language: str = "german",
-        native_language: str = "english",
-        cefr_level: str = "B1",
-        topic: str = "general conversation",
     ):
-        self.target_language = (target_language or "english").strip().lower()
-        self.native_language = (native_language or "english").strip().lower()
-        self.cefr_level = cefr_level or "B1"
-        self.topic = topic or "general conversation"
         self.history: List[ConversationTurn] = []
 
-        # Warm-load models once per session
         load_partner_lm()
-        load_whisper()
 
-    ###########################################################
-    # PROMPT + GENERATION
-    ###########################################################
 
-    def _build_system_prompt(self) -> str:
         base = (
             f"You are a friendly conversation partner speaking {self.target_language}. "
             f"Reply ONLY in {self.target_language}. "
-            f"Do NOT explain grammar, vocabulary, or translations unless the user explicitly asks. "
-            f"Do NOT describe what the sentence means, do NOT say 'the sentence translates to...', "
-            f"and do NOT mention that you are explaining anything. "
             f"Adapt your language to CEFR level {self.cefr_level}. "
            f"{CONTROL_PROMPTS.get(self.cefr_level, '')} "
-            "Keep your replies natural and conversational, usually 1–3 short sentences. "
-            "Ask exactly ONE natural follow-up question related to what the user said. "
-            "Never end the conversation unless the user explicitly ends it. "
-            "Do NOT say goodbye or conclude unless the user does. "
-            "Never talk about being an AI, model, or assistant. "
-            "Do not mention job descriptions, resumes, or HR responsibilities unless the user clearly asks. "
         )
-        if self.topic.strip():
-            base += f"The main topic of conversation is: {self.topic.strip()}. "
         return base
 
     def _generate_lm(self, user_text: str) -> str:
         tok, model = load_partner_lm()
 
-        system_prompt = self._build_system_prompt()
         messages = [
-            {"role": "system", "content": system_prompt},
-            {
-                "role": "user",
-                "content": f"The user (who speaks {self.native_language}) said: {user_text}",
-            },
         ]
 
         prompt = tok.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
         )
 
         enc = tok(prompt, return_tensors="pt").to(model.device)
@@ -251,66 +197,49 @@ class ConversationManager:
         with torch.no_grad():
             out = model.generate(
                 **enc,
-                max_new_tokens=160,  # enough space for natural replies
                 temperature=0.8,
                 top_p=0.95,
-                top_k=50,
                 repetition_penalty=1.15,
-                pad_token_id=tok.eos_token_id,
                 do_sample=True,
             )
 
-        raw = tok.decode(out[0], skip_special_tokens=True).strip()
-
-        # If the user text is echoed, strip it
-        if user_text in raw:
-            raw = raw.split(user_text)[-1].strip()
-
-        # Remove "assistant" label echoes
-        lines = [
-            ln for ln in raw.splitlines()
-            if ln.strip().lower() not in ("assistant", "assistant:")
-        ]
-        raw = "\n".join(lines).strip()
 
-        return clean_assistant_reply(raw)
 
-    ###########################################################
     # PUBLIC REPLY API
-    ###########################################################
 
-    def reply(self, user_text: str, input_lang: str = "german"):
-        """Generate a reply + explanation + TTS audio."""
         self.history.append(ConversationTurn("user", user_text))
 
         assistant_text = self._generate_lm(user_text)
         self.history.append(ConversationTurn("assistant", assistant_text))
 
         explanation = self._generate_explanation(assistant_text)
-        audio = self.text_to_speech(assistant_text)
 
         return {
             "reply_text": assistant_text,
             "explanation": explanation,
-            "audio": audio,
         }
 
-    ###########################################################
-    # SHORT EXPLANATION (EN / native language)
-    ###########################################################
 
     def _generate_explanation(self, assistant_text: str) -> str:
-        """Return exactly ONE simple native-language sentence, no meta, no logic."""
-        if not assistant_text:
-            return ""
-
         tok, model = load_partner_lm()
         prompt = (
             f"Rewrite the meaning of this {self.target_language} sentence "
-            f"in ONE very short {self.native_language} sentence. "
-            f"Do NOT explain what you are doing, do NOT say 'the sentence means', "
-            f"do NOT describe tone, and do NOT provide multiple versions.\n"
-            f"Sentence: \"{assistant_text}\""
         )
 
         enc = tok(prompt, return_tensors="pt").to(model.device)
@@ -323,60 +252,39 @@ class ConversationManager:
                 pad_token_id=tok.eos_token_id,
             )
 
-        raw = tok.decode(out[0], skip_special_tokens=True)
-        raw = raw.replace(prompt, "").strip()
-
-        # keep first sentence only
-        parts = re.split(r"(?<=[.!?])\s+", raw)
-        if parts:
-            raw = parts[0].strip()
 
-        # remove meta leftovers
-        raw = re.sub(r"(?i)the sentence.*$", "", raw)
-        raw = re.sub(r"(?i)this means.*$", "", raw)
 
-        return raw.strip()
 
-
-
-    ###########################################################
-    # AUDIO TRANSCRIPTION — Transformers Whisper
-    ###########################################################
-
-    from transformers import pipeline
-
-    ###########################################################
-    # AUDIO TRANSCRIPTION — Transformers Whisper
-    ###########################################################
-
-    whisper_pipe = pipeline(
-        task="automatic-speech-recognition",
-        model="openai/whisper-small",
-        device="cpu"
-    )
-
     def transcribe(self, audio_segment, spoken_lang=None):
-        import numpy as np
-
         audio = np.array(audio_segment.get_array_of_samples()).astype("float32")
-        audio = audio / np.max(np.abs(audio))
-
-        result = whisper_pipe(audio)
         text = result.get("text", "").strip()
-
         return text, spoken_lang or "unknown", 1.0
 
-    ###########################################################
-    # TEXT → SPEECH
-    ###########################################################
 
     def text_to_speech(self, text: str) -> Optional[bytes]:
-        """Return MP3 bytes for the assistant text, or None on failure."""
         if not text:
             return None
         try:
-            lang_code = GTTS_LANG.get(self.target_language, "en")
-            tts = gTTS(text=text, lang=lang_code)
             buf = io.BytesIO()
             tts.write_to_fp(buf)
             return buf.getvalue()
@@ -384,8 +292,6 @@ class ConversationManager:
             return None
 
 
-###############################################################
 # END OF FILE
-###############################################################
-
-
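Reviewer note: the deletions above drop the whole faster-whisper path (_whisper_model, _LANG_HINTS, transcribe_audio). That code called a load_whisper() helper that never appears in this diff; a minimal sketch of what it presumably did, assuming the faster-whisper package and the removed WHISPER_MODEL_SIZE value of "base", would be:

    # Hypothetical reconstruction of the removed loader; not part of this commit.
    from faster_whisper import WhisperModel

    _whisper_model = None

    def load_whisper():
        """Cache a faster-whisper model in the module global used by transcribe_audio()."""
        global _whisper_model
        if _whisper_model is None:
            _whisper_model = WhisperModel("base", device="cpu", compute_type="int8")
        return _whisper_model

The replacement is the transformers pipeline set up in load_whisper_pipe() in the new version of the file, listed below with additions marked +.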
 
 
 ###############################################################
+# conversation_core.py — Agentic Partner Core
 ###############################################################
 
 import io
 import re
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
 
 import numpy as np
 from pydub import AudioSegment
 import torch
 from gtts import gTTS
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
+    pipeline,
 )
 
+from .config import get_user_dir
 
 
+################################################################
+# MODEL CONSTANTS
+################################################################
 
+QWEN_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+
+# CEFR control hints
+CONTROL_PROMPTS = {
+    "A1": "Use extremely short, simple sentences and very basic vocabulary.",
+    "A2": "Use simple sentences and common everyday vocabulary.",
+    "B1": "Use moderately complex sentences and conversational vocabulary.",
+    "B2": "Use natural, fluent sentences with richer vocabulary.",
+    "C1": "Use complex, advanced sentences with nuanced expressions.",
+    "C2": "Use highly sophisticated, near-native language and style.",
 }
 
+# spoken language → TTS language
 GTTS_LANG = {
     "english": "en",
     "spanish": "es",
 
     "italian": "it",
 }
 
 
+################################################################
+# GLOBAL MODELS
+################################################################
 
+_QWEN_TOKENIZER = None
+_QWEN_MODEL = None
+_WHISPER_PIPE = None
 
+
+def load_partner_lm():
+    """Load Qwen conversational model once."""
     global _QWEN_TOKENIZER, _QWEN_MODEL
+    if _QWEN_MODEL is not None:
         return _QWEN_TOKENIZER, _QWEN_MODEL
 
+    print("[conversation_core] loading:", QWEN_MODEL_NAME)
+
     tok = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         QWEN_MODEL_NAME,
 
         device_map="auto",
         trust_remote_code=True,
     )
+
     _QWEN_TOKENIZER = tok
     _QWEN_MODEL = model
     return tok, model
 
 
+def load_whisper_pipe():
+    """Load Whisper ASR pipeline once."""
+    global _WHISPER_PIPE
+    if _WHISPER_PIPE is not None:
+        return _WHISPER_PIPE
 
+    print("[conversation_core] loading Whisper pipeline…")
+    _WHISPER_PIPE = pipeline(
+        "automatic-speech-recognition",
+        model="openai/whisper-small",
+        device="cpu",
     )
+    return _WHISPER_PIPE
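A quick way to sanity-check the new pipeline on its own; the dict input with an explicit sampling rate is the form the transformers ASR pipeline documents, and it avoids relying on a default rate (the silent one-second array below is just a placeholder):

    import numpy as np

    pipe = load_whisper_pipe()
    samples = np.zeros(16000, dtype="float32")  # placeholder: one second of silence at 16 kHz
    result = pipe({"raw": samples, "sampling_rate": 16000})
    print(result["text"])  # expected to be empty or near-empty for silence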
 
 
 
 
 
+################################################################
 # DATA STRUCTURE
+################################################################
 
 @dataclass
 class ConversationTurn:
 
     text: str
 
 
+
+################################################################
+# CLEANING LM OUTPUT
+################################################################
 
 def clean_assistant_reply(text: str) -> str:
+    """Remove meta junk, labels, identity statements."""
     if not text:
         return ""
 
+    # Remove "assistant:" echo
+    text = re.sub(r"(?i)\bassistant\s*:\s*", "", text)
+    text = re.sub(r"(?i)\buser\s*:\s*", "", text)
 
+    # Remove bullet lists (not desired in conversation)
     text = re.sub(r"(?m)^\s*[-•*]\s+.*$", "", text)
     text = re.sub(r"(?m)^\s*\d+\.\s+.*$", "", text)
 
+    # Remove identity claims
     identity_patterns = [
+        r"(?i)i am an ai.*",
         r"(?i)i am a large language model.*",
+        r"(?i)i was created.*",
+        r"(?i)my name is .*",
     ]
+    for p in identity_patterns:
+        text = re.sub(p, "", text)
 
     text = re.sub(r"\s{2,}", " ", text)
     return text.strip()
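A small before/after illustration of the trimmed-down cleaner; note the identity patterns only match English phrasings ("i am an ai", "my name is ..."), so equivalent sentences in the target language pass through:

    sample = "assistant: Hallo! Wie geht es dir?\n- erstens\n1. zweitens"
    print(clean_assistant_reply(sample))
    # -> "Hallo! Wie geht es dir?"  (label stripped, bullet and numbered lines dropped)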
 
 
+
+################################################################
 # CONVERSATION MANAGER
+################################################################
 
 class ConversationManager:
     def __init__(
         self,
+        target_language="german",
+        native_language="english",
+        cefr_level="B1",
+        topic="general conversation",
     ):
+        self.target_language = target_language.lower()
+        self.native_language = native_language.lower()
+        self.cefr_level = cefr_level.upper()
+        self.topic = topic
         self.history: List[ConversationTurn] = []
 
         load_partner_lm()
+        load_whisper_pipe()
 
+    ################################################################
+    # SYSTEM PROMPT
+    ################################################################
 
+    def _build_system_prompt(self):
         base = (
             f"You are a friendly conversation partner speaking {self.target_language}. "
             f"Reply ONLY in {self.target_language}. "
             f"Adapt your language to CEFR level {self.cefr_level}. "
             f"{CONTROL_PROMPTS.get(self.cefr_level, '')} "
+            f"Topic of conversation: {self.topic}. "
+            "Give 1–3 short natural sentences and ALWAYS end with 1 follow-up question. "
+            "Never mention AI, assistants, grammar explanations, or meta commentary."
         )
         return base
 
+    ################################################################
+    # GENERATION
+    ################################################################
+
     def _generate_lm(self, user_text: str) -> str:
         tok, model = load_partner_lm()
 
         messages = [
+            {"role": "system", "content": self._build_system_prompt()},
+            {"role": "user", "content": user_text},
         ]
 
         prompt = tok.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
         )
 
         enc = tok(prompt, return_tensors="pt").to(model.device)
 
         with torch.no_grad():
             out = model.generate(
                 **enc,
+                max_new_tokens=160,
                 temperature=0.8,
                 top_p=0.95,
                 repetition_penalty=1.15,
                 do_sample=True,
+                pad_token_id=tok.eos_token_id,
             )
 
+        raw = tok.decode(out[0], skip_special_tokens=True)
 
+        # Remove echo
+        cleaned = clean_assistant_reply(raw)
+        return cleaned
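One behaviour change worth flagging: tok.decode(out[0], ...) now decodes the prompt tokens together with the generation (the old version split the echo off on user_text), so system-prompt text can leak into raw, and clean_assistant_reply has no pattern for it. If that shows up in practice, a sketch of decoding only the newly generated tokens would be:

    def decode_new_tokens(tok, enc, out):
        # Sketch, not part of this commit: drop the prompt portion before decoding.
        prompt_len = enc["input_ids"].shape[-1]
        return tok.decode(out[0][prompt_len:], skip_special_tokens=True)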
 
+    ################################################################
     # PUBLIC REPLY API
+    ################################################################
 
+    def reply(self, user_text: str, input_lang="german"):
         self.history.append(ConversationTurn("user", user_text))
 
         assistant_text = self._generate_lm(user_text)
         self.history.append(ConversationTurn("assistant", assistant_text))
 
         explanation = self._generate_explanation(assistant_text)
+        audio_bytes = self.text_to_speech(assistant_text)
 
         return {
             "reply_text": assistant_text,
             "explanation": explanation,
+            "audio": audio_bytes,
         }
 
+    ################################################################
+    # SHORT EXPLANATION
+    ################################################################
 
     def _generate_explanation(self, assistant_text: str) -> str:
         tok, model = load_partner_lm()
+
         prompt = (
             f"Rewrite the meaning of this {self.target_language} sentence "
+            f"in ONE short {self.native_language} sentence:\n{assistant_text}"
         )
 
         enc = tok(prompt, return_tensors="pt").to(model.device)
 
             pad_token_id=tok.eos_token_id,
         )
 
+        decoded = tok.decode(out[0], skip_special_tokens=True)
+        cleaned = decoded.replace(prompt, "").strip()
 
+        # keep only the first sentence
+        parts = re.split(r"(?<=[.!?])\s+", cleaned)
+        return parts[0].strip()
 
+    ################################################################
+    # TRANSCRIPTION — SINGLE VALID VERSION
+    ################################################################
 
     def transcribe(self, audio_segment, spoken_lang=None):
+        """Transcribe using Transformers Whisper."""
+        pipe = load_whisper_pipe()
+
         audio = np.array(audio_segment.get_array_of_samples()).astype("float32")
+        audio = audio / max(np.max(np.abs(audio)), 1e-6)
+
+        result = pipe(audio)
         text = result.get("text", "").strip()
+
         return text, spoken_lang or "unknown", 1.0
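transcribe() expects the caller to pass a pydub AudioSegment; the old docstring said mono 16 kHz, which is also what openai/whisper-small assumes when the pipeline receives a bare numpy array with no sampling rate. A rough usage sketch, with clip.wav standing in for whatever the app records:

    from pydub import AudioSegment

    # Resample to mono 16 kHz before handing the segment to transcribe().
    segment = AudioSegment.from_file("clip.wav").set_channels(1).set_frame_rate(16000)

    manager = ConversationManager(target_language="german")
    text, lang, confidence = manager.transcribe(segment, spoken_lang="de")
    print(text, lang, confidence)

Since the pipeline is called with a raw array rather than the dict form, it cannot resample on its own, so doing it up front matters.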
 
+    ################################################################
+    # TTS — gTTS
+    ################################################################
 
     def text_to_speech(self, text: str) -> Optional[bytes]:
         if not text:
             return None
         try:
+            lang = GTTS_LANG.get(self.target_language, "en")
+            tts = gTTS(text=text, lang=lang)
             buf = io.BytesIO()
             tts.write_to_fp(buf)
             return buf.getvalue()
 
             return None
 
 
+################################################################
 # END OF FILE
+################################################################
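For anyone reviewing this end to end, a rough interactive sketch of the public API after this commit (the import path, topic, and output file name are illustrative, not fixed by the code):

    from app.conversation_core import ConversationManager  # adjust to the actual package layout

    manager = ConversationManager(
        target_language="german",
        native_language="english",
        cefr_level="A2",
        topic="ordering food",
    )

    result = manager.reply("Ich möchte eine Pizza bestellen.")
    print(result["reply_text"])    # reply in the target language, ending with a follow-up question
    print(result["explanation"])   # one short native-language sentence paraphrasing the reply

    if result["audio"]:            # gTTS MP3 bytes, or None if synthesis failed
        with open("reply.mp3", "wb") as f:
            f.write(result["audio"])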