blackccpie commited on
Commit
c507b75
·
1 Parent(s): 724c997

add: initial file versions.

Browse files
Files changed (2) hide show
  1. app.py +283 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # The MIT License
3
+
4
+ # Copyright (c) 2025 Albert Murienne
5
+
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+
13
+ # The above copyright notice and this permission notice shall be included in
14
+ # all copies or substantial portions of the Software.
15
+
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ # THE SOFTWARE.
23
+
24
+ import os
25
+ import logging
26
+ import numpy as np
27
+
28
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
29
+ import librosa
30
+
31
+ from huggingface_hub import InferenceClient
32
+
33
+ from kokoro import KPipeline
34
+
35
# Configure the root logger once at import time (INFO level, timestamped lines).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# INITIALIZE MODELS

# Load Whisper model and processor
#modelcard="openai/whisper-tiny"
modelcard="openai/whisper-small"
processor = WhisperProcessor.from_pretrained(modelcard)
model = WhisperForConditionalGeneration.from_pretrained(modelcard)
# Decoder prompt that forces French transcription regardless of detected language.
# NOTE(review): forced_decoder_ids is deprecated in recent transformers releases —
# confirm it is still honored by the pinned transformers==4.52.4.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")

# Set up Hugging Face InferenceClient (for LLM like llama)
# Remote chat model routed through the "groq" provider; authentication comes
# from the HF_API_KEY environment variable (None if unset — calls will fail).
hf = InferenceClient(
    model="google/gemma-2-9b-it",
    provider="groq",
    api_key=os.environ.get("HF_API_KEY")) # remote LLM

# Load Kokoro
# Text-to-speech pipeline; lang_code "f" selects French voices.
tts_pipeline = KPipeline(
    repo_id='hexgrad/Kokoro-82M',
    lang_code="f") # french

# Read system prompt from external file
# Deliberately fails fast at startup if system_prompt.txt is missing.
with open("system_prompt.txt", "r", encoding="utf-8") as f:
    SYSTEM_PROMPT = f.read().strip()
64
+
65
# DEFINE JAVASCRIPT FOR GRADIO UI

# Page-load script (passed to gr.Blocks(js=...)): dynamically loads
# onnxruntime-web, then the @ricky0123/vad-web bundle, and starts a
# microphone voice-activity detector that auto-clicks Gradio's hidden
# record/stop buttons on speech start/end. Recording is suppressed while
# the TTS player (elem_id "streaming_out") is currently playing.
js = """
async function main() {
  const script1 = document.createElement("script");
  script1.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
  document.head.appendChild(script1)
  const script2 = document.createElement("script");
  script2.onload = async () => {
    console.log("vad loaded") ;
    var record = document.querySelector('.record-button');
    record.textContent = "Just Start Talking!"
    record.style = "width: fit-content; padding-right: 0.5vw;"
    const myvad = await vad.MicVAD.new({
      model: "v5",
      positiveSpeechThreshold: 0.3,
      negativeSpeechThreshold: 0.3,
      minSpeechFrames: 10,
      preSpeechPadFrames: 150,
      onSpeechStart: () => {
        console.log("Speech start detected")
        var record = document.querySelector('.record-button');
        var play_button = document.getElementById("streaming_out").querySelector(".play-pause-button")
        var playing = play_button && (play_button.ariaLabel === "Pause");
        if (record != null && !playing) {
          console.log(record);
          record.click();
        }
      },
      onSpeechEnd: (audio) => {
        console.log("Speech end detected")
        var stop = document.querySelector('.stop-button');
        if (stop != null) {
          console.log(stop);
          stop.click();
        }
      }
    })
    myvad.start()
  }
  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js";
  script1.onload = () => {
    console.log("onnx loaded")
    document.head.appendChild(script2)
  };
}
"""

# Re-applied after each turn (see js=js_reset below): restores the record
# button's label and width after Gradio resets the component.
js_reset = """
() => {
  var record = document.querySelector('.record-button');
  record.textContent = "Just Start Talking!"
  record.style = "width: fit-content; padding-right: 0.5vw;"
}
"""
120
+
121
+ # DEFINE CALLBACKS
122
+
123
# NOTE(review): `spaces` is never imported in this file — add `import spaces`
# (HF Spaces ZeroGPU helper) at the top of the file, otherwise this decorator
# raises NameError at import time.
@spaces.GPU
def transcribe(audio_path):
    """
    Transcribe an audio file to French text using the Whisper model.

    Args:
        audio_path (str | None): Path to the audio file, or None when no
            recording was produced by the microphone component.

    Returns:
        str: Transcribed text, or "" when audio_path is None.
    """

    # Gradio passes None when recording failed or produced no file;
    # librosa.load would raise on it, so bail out early (fixes the old
    # "TODO: check None" left in the original).
    if audio_path is None:
        logging.info("transcribe called with no audio path")
        return ""

    logging.info(f"audio path: {audio_path}")

    # load and resample local WAV file to 16kHz mono (Whisper's expected input)
    audio_array, _ = librosa.load(audio_path, sr=16000, mono=True)

    # process audio into log-mel input features
    input_features = processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features

    # generate token ids, forcing French transcription
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

    # decode token ids to text (batch of one)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    logging.info(f"transcription: {transcription[0]}")

    return transcription[0]
149
+
150
def chat_with_llm(query, history):
    """
    Query the remote LLM with the running conversation and return its reply.

    Args:
        query (str): The user's latest utterance (used for logging only —
            the caller has already appended it to *history*).
        history (list): Conversation history as a list of role/content dicts.

    Returns:
        str: LLM's response text.
    """

    # Build the OpenAI-style message list: system prompt first, then the
    # whole conversation so far.
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(history)

    logging.info(f"user queried: {query}")

    completion = hf.chat_completion(messages=messages, max_tokens=512)
    answer = completion.choices[0].message.content

    logging.info(f"bot answered: {answer}")

    return answer
173
+
174
@spaces.GPU
def synthesize(text, voice="ff_siwis"):
    """
    Synthesize speech from text with the Kokoro TTS pipeline.

    Args:
        text (str): Text to synthesize.
        voice (str): Kokoro voice identifier to use.

    Returns:
        tuple: (sampling rate in Hz, audio samples as a numpy array).
    """

    # Take the first chunk yielded by the generator-based pipeline.
    _, _, audio = next(tts_pipeline(text, voice=voice))

    # Normalize to a numpy array whatever the pipeline handed back.
    if hasattr(audio, "detach"):
        # torch tensor -> host numpy array
        audio = audio.detach().cpu().numpy()
    elif not isinstance(audio, np.ndarray):
        audio = np.array(audio)

    logging.info(f"voice synthesis ready")

    # Sampling rate is hard-coded to 24000 Hz here — presumably Kokoro's
    # native output rate; confirm against the kokoro package docs.
    return (24000, audio)
197
+
198
+ # BUILD THE GRADIO UI
199
+
200
+ import gradio as gr
201
+
202
+ from dataclasses import dataclass, field
203
+
204
@dataclass
class AppState:
    """Per-session state carried through Gradio callbacks."""
    # OpenAI-style message history: [{"role": ..., "content": ...}, ...]
    conversation: list = field(default_factory=list)
207
+
208
with gr.Blocks(js=js) as demo:

    # Per-session application state.
    state = gr.State(value=AppState())

    gr.Image("images/sam.png", height=300)

    # Microphone input; type="filepath" makes callbacks receive a file path.
    input_audio = gr.Audio(
        sources=["microphone"],
        label="Speak",
        type="filepath",
        waveform_options=gr.WaveformOptions(waveform_color="#DB7FBF")
    )
    # Hidden chat transcript (still updated so history is preserved).
    chatbot = gr.Chatbot(
        label="Conversation",
        type="messages",
        visible=False
    )
    # Auto-playing TTS output; elem_id is referenced by the VAD javascript.
    output_audio = gr.Audio(
        label="TTS Response",
        autoplay=True,
        visible=True,
        elem_id="streaming_out"
    )

    def run_step(state: AppState, audio_path):
        """
        Process a single step (turn) in the conversation.

        Args:
            state (AppState): Current application state.
            audio_path (str | None): Path to the recorded audio file.

        Yields:
            tuple: (updated AppState, conversation history, TTS audio tuple
            or None while the answer is not ready yet).
        """

        # Bug fix: the original tested `input_audio` (the always-truthy
        # component object) instead of the recorded value, and used a bare
        # `return AppState()` inside a generator, whose value Gradio
        # discards. Guard on the actual path and yield unchanged outputs.
        if not audio_path:
            yield state, state.conversation, None
            return

        # speech -> text
        user_text = transcribe(audio_path)
        state.conversation.append({"role": "user", "content": user_text})

        # Surface the user's message immediately, before the slow LLM call.
        yield state, state.conversation, None

        # text -> LLM answer -> speech
        bot_text = chat_with_llm(user_text, state.conversation)
        state.conversation.append({"role": "assistant", "content": bot_text})
        audio_tuple = synthesize(bot_text)

        yield state, state.conversation, audio_tuple

    # Pass-through so the in-progress recording and state survive the event.
    stream = input_audio.start_recording(
        lambda audio, state: (audio, state),
        [input_audio, state],
        [input_audio, state],
    )
    # On stop: run the full transcribe / LLM / TTS turn.
    respond = input_audio.stop_recording(
        run_step,
        [state, input_audio],
        [state, chatbot, output_audio]
    )
    # After responding: clear the recorder, then restore its label via js_reset.
    restart = respond.then(
        lambda state: None, [state], [input_audio]).then(
        lambda state: state, state, state, js=js_reset
    )

    # Reset button: wipe the state and cancel any in-flight turn.
    cancel = gr.Button("Restart Conversation", variant="stop")
    cancel.click(
        lambda: (AppState(), gr.Audio(recording=False)),
        None,
        [state, input_audio],
        cancels=[respond, restart],
    )
281
+
282
if __name__ == "__main__":
    # Start the Gradio server (blocking call).
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==5.35.0
2
+ numpy==2.2.6
3
+ transformers==4.52.4
4
+ librosa==0.11.0
5
+ kokoro==0.9.4
6
+ torch==2.5.1