# NucEL / tokenizer.py
# Hugging Face Hub file page, uploaded by FreakingPotato.
# Commit 5bde0cb (verified): "Add NucEL_Tokenizer source for auto_map (matches GitHub repo)"
# File size: 6.51 kB
"""NucEL k-mer tokenizer.
This is the single source of truth for the NucEL tokenizer. It is identical to
the ``tokenizer.py`` shipped with ``FreakingPotato/NucEL`` on the Hugging Face
Hub. The vocabulary follows the layout reported in the paper (Section 3.3)::

    7 special tokens + 4 nucleotides + 4**k k-mers + 16 reserved tokens

For the published checkpoint, ``k = 1`` so the token list has::

    7 + 4 + 4 + 16 = 31 entries (vocab indices 0..30, model ``vocab_size`` is
    27 + padding to the next embedding row).

With ``k = 1`` the four 1-mers are the same strings as the four nucleotides,
so the vocabulary dict holds only 27 distinct tokens: ``A``, ``C``, ``G`` and
``T`` keep the later ids 11..14 and ids 7..10 are never produced.

The discriminator's ``vocab_size`` in ``config.json`` is 27; the 4 trailing
reserved slots exist on the tokenizer side but are not addressed by the model
weights. They are reserved for future extensions to the alphabet.
"""
from itertools import product
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import json
import os
from transformers import PreTrainedTokenizer
class NucEL_Tokenizer(PreTrainedTokenizer):
    """k-mer tokenizer for DNA sequences used by NucEL.

    The vocabulary is laid out as: 7 special tokens, the 4 nucleotides,
    all ``4**k`` k-mers, then ``num_reserved_tokens`` placeholder tokens.
    Ids are assigned by position in that list.

    Note for ``k = 1``: the 1-mers are the same strings as the nucleotides,
    so each base appears twice in the list and the mapping keeps the later
    index (``A``..``T`` -> 11..14). Ids 7..10 are therefore never produced
    and ``vocab_size`` (the number of distinct tokens) is 27 while the
    largest id is 30. This layout is kept as-is so existing ids stay stable.
    """

    model_input_names = ["input_ids", "attention_mask"]

    # Keyword arguments of ``from_pretrained`` that configure the Hub download
    # and are forwarded to ``snapshot_download`` instead of the constructor.
    _HUB_DOWNLOAD_KWARGS = (
        "revision",
        "cache_dir",
        "token",
        "local_files_only",
        "force_download",
    )

    def __init__(
        self,
        k: int = 1,
        model_max_length: int = 2048,
        pad_token: str = "[PAD]",
        unk_token: str = "[UNK]",
        sep_token: str = "[SEP]",
        cls_token: str = "[CLS]",
        mask_token: str = "[MASK]",
        bos_token: str = "[BOS]",
        eos_token: str = "[EOS]",
        num_reserved_tokens: int = 16,
        **kwargs: Any,
    ) -> None:
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            k: Length of the k-mers; must be at least 1.
            model_max_length: Maximum sequence length passed to the base class.
            pad_token, unk_token, sep_token, cls_token, mask_token, bos_token,
                eos_token: Strings used for the seven special tokens.
            num_reserved_tokens: Number of ``[RESERVED_i]`` placeholder tokens
                appended to the vocabulary.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        # k == 0 would put the empty string in the vocabulary and make
        # ``_tokenize`` match it forever without advancing.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")

        self.k = k
        self.nucleotides = ["A", "C", "G", "T"]
        self.num_reserved_tokens = num_reserved_tokens
        self.special_tokens = {
            "pad_token": pad_token,
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "mask_token": mask_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
        }
        # The vocabulary is built before the base constructor runs, as in the
        # original ordering (the base class is expected to consult
        # ``get_vocab()`` while registering special tokens -- confirm against
        # the installed transformers version).
        self._init_vocabulary()
        super().__init__(
            # ``k`` and ``num_reserved_tokens`` are passed through so the base
            # class can record them with the other init arguments; otherwise a
            # saved k-mer tokenizer would be reloaded with the default k = 1.
            k=k,
            num_reserved_tokens=num_reserved_tokens,
            model_max_length=model_max_length,
            pad_token=pad_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs,
        )

    def _init_vocabulary(self) -> None:
        """Populate ``self.vocab`` and ``self.ids_to_tokens``.

        Order: special tokens, nucleotides, k-mers, reserved tokens. When a
        token string occurs more than once (the k = 1 case) the later index
        wins in ``self.vocab``.
        """
        special_tokens = [
            self.special_tokens["pad_token"],
            self.special_tokens["unk_token"],
            self.special_tokens["cls_token"],
            self.special_tokens["sep_token"],
            self.special_tokens["mask_token"],
            self.special_tokens["bos_token"],
            self.special_tokens["eos_token"],
        ]
        kmers = ["".join(p) for p in product(self.nucleotides, repeat=self.k)]
        reserved = [f"[RESERVED_{i}]" for i in range(self.num_reserved_tokens)]
        all_tokens = special_tokens + self.nucleotides + kmers + reserved
        self.vocab = {token: idx for idx, token in enumerate(all_tokens)}
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}

    @property
    def vocab_size(self) -> int:
        """Number of distinct tokens in the vocabulary (``len(self.vocab)``)."""
        return len(self.vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a shallow copy of the token -> id mapping."""
        return self.vocab.copy()

    def _tokenize(self, text: str) -> List[str]:
        """Split an upper-cased, stripped sequence into k-mer tokens.

        The result always starts with the CLS token. At each position a full
        k-mer is consumed when it is in the vocabulary; otherwise a single
        character is consumed and emitted as a nucleotide, or as the unknown
        token when it is not one of ``self.nucleotides``.
        """
        text = text.upper().strip()
        tokens = [self.cls_token]
        k = self.k
        length = len(text)
        pos = 0
        while pos < length:
            # Only try a k-mer while a full window still fits in the text.
            if pos <= length - k:
                kmer = text[pos : pos + k]
                if kmer in self.vocab:
                    tokens.append(kmer)
                    pos += k
                    continue
            # Fallback: advance by one character.
            base = text[pos]
            tokens.append(base if base in self.nucleotides else self.unk_token)
            pos += 1
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id, falling back to the unknown token's id."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its token, falling back to the unknown token."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """Write the vocabulary as JSON and return the file path in a tuple.

        The file is ``<save_directory>/<filename_prefix or "vocab">.json``.
        Note that ``from_pretrained`` on this class only reads ``vocab.json``.

        Args:
            save_directory: Target directory; created if it does not exist.
            filename_prefix: Optional replacement for the ``vocab`` file stem.
        """
        # ``save_pretrained`` calls this before delegating to the base class,
        # so the directory may not exist yet.
        os.makedirs(save_directory, exist_ok=True)
        prefix = filename_prefix or "vocab"
        vocab_file = os.path.join(save_directory, f"{prefix}.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)

    def save_pretrained(
        self,
        save_directory: str,
        legacy_format: bool = True,
        filename_prefix: Optional[str] = None,
        **kwargs: Any,
    ):
        """Save the vocabulary, then the base-class tokenizer files.

        ``filename_prefix`` is applied to the vocabulary file only; it is not
        forwarded to the base class.

        Returns:
            Whatever the base ``save_pretrained`` returns, or the vocabulary
            file tuple when that result is falsy.
        """
        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
        saved = super().save_pretrained(
            save_directory,
            legacy_format=legacy_format,
            **kwargs,
        )
        return saved or vocab_files

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs: Any,
        **kwargs: Any,
    ) -> "NucEL_Tokenizer":
        """Load a tokenizer from a local directory or a Hugging Face Hub repo.

        Reads ``tokenizer_config.json`` and ``vocab.json``. Values from the
        config are used as constructor defaults and can be overridden through
        ``kwargs``. The vocabulary stored in ``vocab.json`` replaces the one
        generated by the constructor.

        Args:
            pretrained_model_name_or_path: Existing directory, or a Hub repo id
                that is downloaded with ``snapshot_download``.
            *init_inputs: Accepted for interface compatibility; unused.
            **kwargs: ``revision``, ``cache_dir``, ``token``,
                ``local_files_only`` and ``force_download`` are forwarded to
                ``snapshot_download``; everything else goes to the constructor.

        Raises:
            FileNotFoundError: If either JSON file is missing.
        """
        # Split off download options so they reach the Hub client instead of
        # being passed on to the tokenizer constructor.
        hub_kwargs = {
            name: kwargs.pop(name)
            for name in cls._HUB_DOWNLOAD_KWARGS
            if name in kwargs
        }

        path = Path(pretrained_model_name_or_path)
        if not path.is_dir():
            from huggingface_hub import snapshot_download

            local_dir = snapshot_download(
                repo_id=str(pretrained_model_name_or_path),
                allow_patterns=[
                    "tokenizer_config.json",
                    "vocab.json",
                    "special_tokens_map.json",
                    "tokenizer.py",
                ],
                **hub_kwargs,
            )
            path = Path(local_dir)

        with open(path / "tokenizer_config.json", "r", encoding="utf-8") as f:
            config = json.load(f)
        with open(path / "vocab.json", "r", encoding="utf-8") as f:
            vocab = json.load(f)

        init_kwargs: Dict[str, Any] = {
            "k": config.get("k", 1),
            "model_max_length": config.get("model_max_length", 2048),
            "pad_token": config.get("pad_token", "[PAD]"),
            "unk_token": config.get("unk_token", "[UNK]"),
            "sep_token": config.get("sep_token", "[SEP]"),
            "cls_token": config.get("cls_token", "[CLS]"),
            "mask_token": config.get("mask_token", "[MASK]"),
            "bos_token": config.get("bos_token", "[BOS]"),
            "eos_token": config.get("eos_token", "[EOS]"),
            "num_reserved_tokens": config.get("num_reserved_tokens", 16),
        }
        # Caller-supplied values win over the stored config; merging here also
        # avoids a duplicate-keyword TypeError when both define the same name.
        init_kwargs.update(kwargs)

        tokenizer = cls(**init_kwargs)
        tokenizer.vocab = vocab
        tokenizer.ids_to_tokens = {idx: token for token, idx in vocab.items()}
        return tokenizer