Added example to run on smaller GPUs
#1
by buzzcraft - opened

README.md CHANGED
@@ -296,6 +296,53 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 tokenizer = AutoTokenizer.from_pretrained("norallm/normistral-7b-warm")
 model = AutoModelForCausalLM.from_pretrained("norallm/normistral-7b-warm").cuda().eval()

+# Now we will define the zero-shot prompt template
+prompt = """Engelsk: {0}
+Bokmål:"""
+
+# A function that will take care of generating the output
+@torch.no_grad()
+def generate(text):
+    text = prompt.format(text)
+    input_ids = tokenizer(text, return_tensors='pt').input_ids.cuda()
+    prediction = model.generate(
+        input_ids,
+        max_new_tokens=64,
+        do_sample=False,
+        eos_token_id=tokenizer('\n').input_ids
+    )
+    return tokenizer.decode(prediction[0, input_ids.size(1):]).strip()
+
+# Now you can simply call the generate function with an English text you want to translate:
+generate("I'm super excited about this Norwegian NORA model! Can it translate these sentences?")
+# > this should output: 'Jeg er super spent på denne norske NORA modellen! Kan den oversette disse setningene?'
+```
+
+_____
+## Example usage on smaller GPUs
+Install bitsandbytes and accelerate if you want to load the model in 8-bit:
+
+```bash
+pip install bitsandbytes
+pip install accelerate
+```
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+# First, we will have to import the tokenizer and the language model
+tokenizer = AutoTokenizer.from_pretrained("norallm/normistral-7b-warm")
+model = AutoModelForCausalLM.from_pretrained(
+    "norallm/normistral-7b-warm",
+    device_map='auto',
+    load_in_8bit=True,
+    torch_dtype=torch.float16
+)
+# This setup needs about 8 GB of VRAM
+# With load_in_8bit=False, it needs about 15 GB of VRAM
+# With torch_dtype=torch.float32 and load_in_8bit=False, about 21 GB of VRAM
+
 # Now we will define the zero-shot prompt template
 prompt = """Engelsk: {0}
 Bokmål:"""
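One caveat worth flagging on the added section: recent transformers releases deprecate passing `load_in_8bit` directly to `from_pretrained` in favour of a `BitsAndBytesConfig` object. Below is a minimal sketch of the equivalent call, not part of the PR, assuming transformers 4.30+ with bitsandbytes and accelerate installed:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained("norallm/normistral-7b-warm")

# Sketch: 8-bit quantization expressed through BitsAndBytesConfig,
# equivalent to load_in_8bit=True in the diff above
model = AutoModelForCausalLM.from_pretrained(
    "norallm/normistral-7b-warm",
    device_map="auto",          # let accelerate place layers on the available GPU(s)
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,  # dtype of the non-quantized modules
)

# For even smaller GPUs, 4-bit loading follows the same pattern:
# quantization_config=BitsAndBytesConfig(load_in_4bit=True)
```

Either route should land in the same ballpark as the ~8 GB figure quoted above; `torch.cuda.max_memory_allocated()` is a quick way to verify the actual number on your own hardware.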