"""NucEL k-mer tokenizer. This is the single source of truth for the NucEL tokenizer. It is identical to the ``tokenizer.py`` shipped with ``FreakingPotato/NucEL`` on the Hugging Face Hub. The vocabulary follows the layout reported in the paper (Section 3.3): 7 special tokens + 4 nucleotides + 4**k k-mers + 16 reserved tokens For the published checkpoint, ``k = 1`` so the vocabulary size is:: 7 + 4 + 4 + 16 = 31 (vocab indices 0..30, model ``vocab_size`` is 27 + padding to the next embedding row). The discriminator's ``vocab_size`` in ``config.json`` is 27; the 4 trailing reserved slots exist on the tokenizer side but are not addressed by the model weights. They are reserved for future extensions to the alphabet. """ from itertools import product from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union import json import os from transformers import PreTrainedTokenizer class NucEL_Tokenizer(PreTrainedTokenizer): """k-mer tokenizer for DNA sequences used by NucEL.""" model_input_names = ["input_ids", "attention_mask"] def __init__( self, k: int = 1, model_max_length: int = 2048, pad_token: str = "[PAD]", unk_token: str = "[UNK]", sep_token: str = "[SEP]", cls_token: str = "[CLS]", mask_token: str = "[MASK]", bos_token: str = "[BOS]", eos_token: str = "[EOS]", num_reserved_tokens: int = 16, **kwargs: Any, ) -> None: self.k = k self.nucleotides = ["A", "C", "G", "T"] self.num_reserved_tokens = num_reserved_tokens self.special_tokens = { "pad_token": pad_token, "unk_token": unk_token, "sep_token": sep_token, "cls_token": cls_token, "mask_token": mask_token, "bos_token": bos_token, "eos_token": eos_token, } self._init_vocabulary() super().__init__( model_max_length=model_max_length, pad_token=pad_token, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, mask_token=mask_token, bos_token=bos_token, eos_token=eos_token, **kwargs, ) def _init_vocabulary(self) -> None: special_tokens = [ self.special_tokens["pad_token"], self.special_tokens["unk_token"], self.special_tokens["cls_token"], self.special_tokens["sep_token"], self.special_tokens["mask_token"], self.special_tokens["bos_token"], self.special_tokens["eos_token"], ] kmers = ["".join(p) for p in product(self.nucleotides, repeat=self.k)] reserved = [f"[RESERVED_{i}]" for i in range(self.num_reserved_tokens)] all_tokens = special_tokens + self.nucleotides + kmers + reserved self.vocab = {token: idx for idx, token in enumerate(all_tokens)} self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()} @property def vocab_size(self) -> int: return len(self.vocab) def get_vocab(self) -> Dict[str, int]: return self.vocab.copy() def _tokenize(self, text: str) -> List[str]: text = text.upper().strip() tokens = [self.cls_token] i = 0 while i < len(text): if i <= len(text) - self.k: kmer = text[i : i + self.k] if kmer in self.vocab: tokens.append(kmer) i += self.k continue if i < len(text): nt = text[i] tokens.append(nt if nt in self.nucleotides else self.unk_token) i += 1 return tokens def _convert_token_to_id(self, token: str) -> int: return self.vocab.get(token, self.vocab[self.unk_token]) def _convert_id_to_token(self, index: int) -> str: return self.ids_to_tokens.get(index, self.unk_token) def save_vocabulary( self, save_directory: str, filename_prefix: Optional[str] = None, ) -> Tuple[str]: prefix = filename_prefix or "vocab" vocab_file = os.path.join(save_directory, f"{prefix}.json") with open(vocab_file, "w", encoding="utf-8") as f: json.dump(self.vocab, f, ensure_ascii=False, indent=2) return (vocab_file,) def save_pretrained( self, save_directory: str, legacy_format: bool = True, filename_prefix: Optional[str] = None, **kwargs: Any, ): vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) return super().save_pretrained( save_directory, legacy_format=legacy_format, **kwargs, ) or vocab_files @classmethod def from_pretrained( cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs: Any, **kwargs: Any, ) -> "NucEL_Tokenizer": path = Path(pretrained_model_name_or_path) if not path.is_dir(): from huggingface_hub import snapshot_download local_dir = snapshot_download( repo_id=str(pretrained_model_name_or_path), allow_patterns=[ "tokenizer_config.json", "vocab.json", "special_tokens_map.json", "tokenizer.py", ], ) path = Path(local_dir) with open(path / "tokenizer_config.json", "r", encoding="utf-8") as f: config = json.load(f) with open(path / "vocab.json", "r", encoding="utf-8") as f: vocab = json.load(f) tokenizer = cls( k=config.get("k", 1), model_max_length=config.get("model_max_length", 2048), pad_token=config.get("pad_token", "[PAD]"), unk_token=config.get("unk_token", "[UNK]"), sep_token=config.get("sep_token", "[SEP]"), cls_token=config.get("cls_token", "[CLS]"), mask_token=config.get("mask_token", "[MASK]"), bos_token=config.get("bos_token", "[BOS]"), eos_token=config.get("eos_token", "[EOS]"), **kwargs, ) tokenizer.vocab = vocab tokenizer.ids_to_tokens = {idx: token for token, idx in vocab.items()} return tokenizer