Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,12 +11,9 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 11 |
|
| 12 |
# === GPTQ 2-bit QUANTIZATION CONFIG ===
|
| 13 |
quantize_config = BaseQuantizeConfig(
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
compute_dtype=torch.float16,
|
| 18 |
-
use_double_quant=True,
|
| 19 |
-
quant_type="nf4"
|
| 20 |
)
|
| 21 |
|
| 22 |
# === LOAD GPTQ-QUANTIZED MODEL ===
|
|
@@ -132,4 +129,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 132 |
|
| 133 |
clear_button.click(lambda: ([], []), None, [chatbot, history_state])
|
| 134 |
|
| 135 |
-
demo.launch(ssr_mode=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# === GPTQ 2-bit QUANTIZATION CONFIG ===
|
| 13 |
quantize_config = BaseQuantizeConfig(
|
| 14 |
+
bits=2, # 2-bit quantization
|
| 15 |
+
group_size=128, # grouping size
|
| 16 |
+
desc_act=False # disable activation-order (act-order) quantization for faster inference
|
|
|
|
|
|
|
|
|
|
| 17 |
)
|
| 18 |
|
| 19 |
# === LOAD GPTQ-QUANTIZED MODEL ===
|
|
|
|
| 129 |
|
| 130 |
clear_button.click(lambda: ([], []), None, [chatbot, history_state])
|
| 131 |
|
| 132 |
+
demo.launch(ssr_mode=False)
|
| 133 |
+
|
| 134 |
+
# Note:
|
| 135 |
+
# nf4 and double quantization are bitsandbytes options, not AutoGPTQ features. To get AutoGPTQ's CUDA kernels back, reinstall AutoGPTQ with CUDA support:
|
| 136 |
+
# pip install git+https://github.com/PanQiWei/AutoGPTQ.git#egg=auto-gptq[cuda]
|