# Copyright (c) OpenMMLab. All rights reserved.
import collections
import os

from mmengine.fileio import list_from_file
from transformers import (AutoTokenizer, BartTokenizer, BasicTokenizer,
                          BertTokenizer, BertTokenizerFast, LlamaTokenizer,
                          WordpieceTokenizer)

from mmpretrain.registry import TOKENIZER
from .huggingface import register_hf_tokenizer

register_hf_tokenizer(AutoTokenizer)
register_hf_tokenizer(LlamaTokenizer)
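# Registered tokenizers can be built through mmpretrain's TOKENIZER registry.
# A minimal usage sketch, assuming `register_hf_tokenizer` forwards the config
# to `from_pretrained` and that the checkpoint key is `name_or_path` (see
# `.huggingface` for the exact interface; the model name is illustrative):
# >>> tokenizer = TOKENIZER.build(
# ...     dict(type='AutoTokenizer', name_or_path='bert-base-uncased'))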

@register_hf_tokenizer()
class BlipTokenizer(BertTokenizerFast):
    """BlipTokenizer inherits BertTokenizerFast (fast, Rust-based)."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        tokenizer.add_special_tokens({'bos_token': '[DEC]'})
        tokenizer.add_special_tokens({'additional_special_tokens': ['[ENC]']})
        return tokenizer
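# Minimal usage sketch (the checkpoint name is illustrative; BLIP configs in
# mmpretrain supply their own `pretrained_model_name_or_path`):
# >>> tokenizer = BlipTokenizer.from_pretrained('bert-base-uncased')
# >>> tokenizer.bos_token                    # '[DEC]'
# >>> tokenizer.additional_special_tokens    # includes '[ENC]'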

@register_hf_tokenizer()
class Blip2Tokenizer(BertTokenizer):

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        tokenizer.add_special_tokens({'bos_token': '[DEC]'})
        return tokenizer
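# Blip2Tokenizer differs from BlipTokenizer only in that it builds on the slow
# (Python-based) BertTokenizer and registers just the '[DEC]' bos token,
# without the extra '[ENC]' special token.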

@register_hf_tokenizer()
class OFATokenizer(BartTokenizer):

    vocab_files_names = {
        'vocab_file': 'vocab.json',
        'merges_file': 'merges.txt'
    }

    pretrained_vocab_files_map = {
        'vocab_file': {
            'OFA-Sys/OFA-tiny':
            'https://huggingface.co/OFA-Sys/OFA-tiny/blob/main/vocab.json',
            'OFA-Sys/OFA-medium':
            'https://huggingface.co/OFA-Sys/OFA-medium/blob/main/vocab.json',
            'OFA-Sys/OFA-base':
            'https://huggingface.co/OFA-Sys/OFA-base/blob/main/vocab.json',
            'OFA-Sys/OFA-large':
            'https://huggingface.co/OFA-Sys/OFA-large/blob/main/vocab.json',
        },
        'merges_file': {
            'OFA-Sys/OFA-tiny':
            'https://huggingface.co/OFA-Sys/OFA-tiny/blob/main/merges.txt',
            'OFA-Sys/OFA-medium':
            'https://huggingface.co/OFA-Sys/OFA-medium/blob/main/merges.txt',
            'OFA-Sys/OFA-base':
            'https://huggingface.co/OFA-Sys/OFA-base/blob/main/merges.txt',
            'OFA-Sys/OFA-large':
            'https://huggingface.co/OFA-Sys/OFA-large/blob/main/merges.txt',
        },
    }

    max_model_input_sizes = {
        'OFA-Sys/OFA-tiny': 1024,
        'OFA-Sys/OFA-medium': 1024,
        'OFA-Sys/OFA-base': 1024,
        'OFA-Sys/OFA-large': 1024,
    }

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        *init_inputs,
        **kwargs,
    ):
        num_bins = kwargs.pop('num_bins', 1000)
        tokenizer = super().from_pretrained(
            pretrained_model_name_or_path,
            *init_inputs,
            **kwargs,
        )
        length = len(tokenizer)
        tokenizer.add_tokens(['<code_{}>'.format(i) for i in range(8192)])
        tokenizer.code_offset = length
        tokenizer.add_tokens(['<bin_{}>'.format(i) for i in range(num_bins)])
        tokenizer.bin_offset = length + 8192
        tokenizer.num_bins = num_bins
        return tokenizer
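# Token-id layout produced by `OFATokenizer.from_pretrained` (derived from the
# code above, assuming none of the added tokens already exist in the base
# vocabulary): the 8192 `<code_i>` tokens start at `tokenizer.code_offset` and
# the `num_bins` `<bin_i>` tokens start at `tokenizer.bin_offset`, e.g.
# >>> tok = OFATokenizer.from_pretrained('OFA-Sys/OFA-base')
# >>> tok.convert_tokens_to_ids('<code_0>') == tok.code_offset    # True
# >>> tok.convert_tokens_to_ids('<bin_0>') == tok.bin_offset      # True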

@register_hf_tokenizer()
class FullTokenizer(BertTokenizer):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = self.load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token='[UNK]', max_input_chars_per_word=200)

    def load_vocab(self, vocab_file):
        """Loads a vocabulary file into a dictionary."""
        vocab = collections.OrderedDict()
        index = 0
        vocab_list = list_from_file(vocab_file)
        for token in vocab_list:
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
        return vocab
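    # Expected vocabulary format (as implied by the loop above): a plain-text
    # file with one token per line, where the line index becomes the token id;
    # reading stops at the first empty entry. For example, a file containing
    #     [PAD]
    #     [UNK]
    #     the
    # yields {'[PAD]': 0, '[UNK]': 1, 'the': 2}.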

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens
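    # `tokenize` first applies BasicTokenizer (whitespace/punctuation splitting
    # and optional lower-casing), then WordpieceTokenizer on each piece, so the
    # behaviour mirrors the original BERT tokenizer. With a typical English
    # WordPiece vocabulary, e.g.:
    # >>> tokenizer.tokenize('unaffable')
    # ['un', '##aff', '##able']            # exact split depends on the vocab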

    def convert_by_vocab(self, vocab, items):
        """Converts a sequence of [tokens|ids] using the vocab."""
        output = []
        for item in items:
            output.append(vocab[item])
        return output

    def convert_tokens_to_ids(self, tokens):
        return self.convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return self.convert_by_vocab(self.inv_vocab, ids)
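    # `convert_by_vocab` is a plain dict lookup, so out-of-vocabulary tokens
    # (or unknown ids) raise KeyError instead of falling back to '[UNK]'. For
    # in-vocabulary tokens the two conversions are inverses:
    # >>> tokenizer.convert_ids_to_tokens(
    # ...     tokenizer.convert_tokens_to_ids(tokens)) == tokens   # True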

    @staticmethod
    def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
        """Converts a sequence of tokens (string) into a single string."""

        def clean_up_tokenization(out_string):
            """Clean up a list of simple English tokenization artifacts like
            spaces before punctuations and abbreviated forms."""
            out_string = (
                out_string.replace(' .', '.').replace(' ?', '?').replace(
                    ' !', '!').replace(' ,', ',').replace(" ' ", "'").replace(
                        " n't", "n't").replace(" 'm", "'m").replace(
                            " 's", "'s").replace(" 've", "'ve").replace(
                                " 're", "'re"))
            return out_string

        text = ' '.join(tokens).replace(' ##', '').strip()
        if clean_up_tokenization_spaces:
            clean_text = clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def vocab_size(self):
        return len(self.vocab)
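# End-to-end sketch for FullTokenizer (assumes `vocab.txt` is a BERT-style
# WordPiece vocabulary with one token per line; the path is illustrative):
# >>> tokenizer = FullTokenizer('vocab.txt', do_lower_case=True)
# >>> tokens = tokenizer.tokenize('Hello, world!')
# >>> ids = tokenizer.convert_tokens_to_ids(tokens)
# >>> tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))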