NucEL / tokenizer.py

Add NucEL_Tokenizer source for auto_map (matches GitHub repo)

5bde0cb verified about 1 month ago

6.51 kB

	"""NucEL k-mer tokenizer.

	This is the single source of truth for the NucEL tokenizer. It is identical to
	the ``tokenizer.py`` shipped with ``FreakingPotato/NucEL`` on the Hugging Face
	Hub. The vocabulary follows the layout reported in the paper (Section 3.3):

	7 special tokens + 4 nucleotides + 4**k k-mers + 16 reserved tokens

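	For the published checkpoint, ``k = 1``, so the layout spans::

	7 + 4 + 4 + 16 = 31 slots (vocab indices 0..30; the model ``vocab_size`` is
	27 + padding to the next embedding row).

	With ``k = 1`` the four 1-mers are the same strings as the four nucleotides,
	so the token-to-id map holds only 7 + 4 + 16 = 27 distinct tokens: each
	nucleotide keeps the id of its k-mer slot (11..14) and ids 7..10 are unused.

	The discriminator's ``vocab_size`` in ``config.json`` is 27; the 4 trailing
	reserved slots exist on the tokenizer side but are not addressed by the model
	weights. They are reserved for future extensions to the alphabet.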
	"""

	from itertools import product
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple, Union
	import json
	import os

	from transformers import PreTrainedTokenizer


	class NucEL_Tokenizer(PreTrainedTokenizer):
	    """k-mer tokenizer for DNA sequences used by NucEL.

	    The vocabulary is built as an ordered list: the seven special tokens, the
	    four nucleotides, all ``4**k`` k-mers, then ``num_reserved_tokens``
	    ``[RESERVED_i]`` placeholders. A token's id is its position in that list.
	    When a string occurs twice (for ``k = 1`` every 1-mer is also a
	    nucleotide) it keeps the id of its *last* occurrence, so with the default
	    arguments ``A, C, G, T`` map to 11..14 and ids 7..10 are unassigned.
	    """

	    model_input_names = ["input_ids", "attention_mask"]

	    def __init__(
	        self,
	        k: int = 1,
	        model_max_length: int = 2048,
	        pad_token: str = "[PAD]",
	        unk_token: str = "[UNK]",
	        sep_token: str = "[SEP]",
	        cls_token: str = "[CLS]",
	        mask_token: str = "[MASK]",
	        bos_token: str = "[BOS]",
	        eos_token: str = "[EOS]",
	        num_reserved_tokens: int = 16,
	        **kwargs: Any,
	    ) -> None:
	        """Build the vocabulary and initialise the base tokenizer.

	        Args:
	            k: Length of the k-mers; must be at least 1.
	            model_max_length: Forwarded to the base tokenizer.
	            pad_token, unk_token, sep_token, cls_token, mask_token, bos_token,
	                eos_token: Text of the special tokens.
	            num_reserved_tokens: Number of ``[RESERVED_i]`` placeholders
	                appended after the k-mers.
	            **kwargs: Forwarded unchanged to the base tokenizer.

	        Raises:
	            ValueError: If ``k`` is smaller than 1.
	        """
	        # k == 0 would make the empty string a valid "k-mer" and _tokenize
	        # would never advance; reject it up front.
	        if k < 1:
	            raise ValueError(f"k must be a positive integer, got {k!r}")

	        self.k = k
	        self.nucleotides = ["A", "C", "G", "T"]
	        self.num_reserved_tokens = num_reserved_tokens

	        self.special_tokens = {
	            "pad_token": pad_token,
	            "unk_token": unk_token,
	            "sep_token": sep_token,
	            "cls_token": cls_token,
	            "mask_token": mask_token,
	            "bos_token": bos_token,
	            "eos_token": eos_token,
	        }

	        # The vocabulary has to exist before the base class initialises.
	        self._init_vocabulary()

	        super().__init__(
	            model_max_length=model_max_length,
	            pad_token=pad_token,
	            unk_token=unk_token,
	            sep_token=sep_token,
	            cls_token=cls_token,
	            mask_token=mask_token,
	            bos_token=bos_token,
	            eos_token=eos_token,
	            # Forwarded so the base class records them with the other init
	            # arguments and from_pretrained can restore them after a save.
	            k=k,
	            num_reserved_tokens=num_reserved_tokens,
	            **kwargs,
	        )

	    def _init_vocabulary(self) -> None:
	        """Populate ``self.vocab`` and its inverse ``self.ids_to_tokens``."""
	        special_tokens = [
	            self.special_tokens[name]
	            for name in (
	                "pad_token",
	                "unk_token",
	                "cls_token",
	                "sep_token",
	                "mask_token",
	                "bos_token",
	                "eos_token",
	            )
	        ]
	        kmers = ["".join(p) for p in product(self.nucleotides, repeat=self.k)]
	        reserved = [f"[RESERVED_{i}]" for i in range(self.num_reserved_tokens)]

	        all_tokens = special_tokens + self.nucleotides + kmers + reserved
	        # Duplicate strings keep the index of their last occurrence; the ids
	        # of the earlier occurrences are simply left out of both maps.
	        self.vocab = {token: idx for idx, token in enumerate(all_tokens)}
	        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}

	    @property
	    def vocab_size(self) -> int:
	        """Number of distinct tokens in the vocabulary.

	        This can be smaller than the largest id plus one, because duplicate
	        strings in the layout collapse onto a single entry.
	        """
	        return len(self.vocab)

	    def get_vocab(self) -> Dict[str, int]:
	        """Return a copy of the token-to-id mapping."""
	        return self.vocab.copy()

	    def _tokenize(self, text: str) -> List[str]:
	        """Split ``text`` into k-mer tokens, prefixed with the CLS token.

	        The text is upper-cased and stripped. At each position a full-length
	        k-mer found in the vocabulary is consumed; otherwise one character is
	        consumed and emitted as a nucleotide, or as the unknown token when it
	        is not one of ``A, C, G, T``.
	        """
	        text = text.upper().strip()
	        k = self.k
	        tokens = [self.cls_token]
	        pos = 0
	        length = len(text)
	        while pos < length:
	            kmer = text[pos : pos + k]
	            # A short slice at the tail must not be treated as a k-mer.
	            if len(kmer) == k and kmer in self.vocab:
	                tokens.append(kmer)
	                pos += k
	                continue
	            nt = text[pos]
	            tokens.append(nt if nt in self.nucleotides else self.unk_token)
	            pos += 1
	        return tokens

	    def _convert_token_to_id(self, token: str) -> int:
	        """Return the id of ``token``, or the unknown-token id if absent."""
	        return self.vocab.get(token, self.vocab[self.unk_token])

	    def _convert_id_to_token(self, index: int) -> str:
	        """Return the token for ``index``, or the unknown token if absent."""
	        return self.ids_to_tokens.get(index, self.unk_token)

	    def save_vocabulary(
	        self,
	        save_directory: str,
	        filename_prefix: Optional[str] = None,
	    ) -> Tuple[str]:
	        """Write the vocabulary as JSON and return the path in a 1-tuple.

	        The file is ``<save_directory>/<filename_prefix>.json``, or
	        ``vocab.json`` when no prefix is given. The directory is created if it
	        does not exist yet.
	        """
	        os.makedirs(save_directory, exist_ok=True)
	        prefix = filename_prefix or "vocab"
	        vocab_file = os.path.join(save_directory, f"{prefix}.json")
	        with open(vocab_file, "w", encoding="utf-8") as f:
	            json.dump(self.vocab, f, ensure_ascii=False, indent=2)
	        return (vocab_file,)

	    def save_pretrained(
	        self,
	        save_directory: str,
	        legacy_format: bool = True,
	        filename_prefix: Optional[str] = None,
	        **kwargs: Any,
	    ):
	        """Save the vocabulary, then delegate to the base ``save_pretrained``.

	        ``filename_prefix`` only affects the vocabulary file written here; it
	        is not forwarded to the base class. Returns the base class result, or
	        the vocabulary path tuple when that result is empty.
	        """
	        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
	        return super().save_pretrained(
	            save_directory,
	            legacy_format=legacy_format,
	            **kwargs,
	        ) or vocab_files

	    @staticmethod
	    def _token_text(value: Any, default: str) -> Any:
	        """Return the text of a special token read from a tokenizer config.

	        A config may store a special token as a dict carrying its text under
	        ``"content"``; anything else is returned unchanged.
	        """
	        if isinstance(value, dict):
	            return value.get("content", default)
	        return value

	    @classmethod
	    def from_pretrained(
	        cls,
	        pretrained_model_name_or_path: Union[str, os.PathLike],
	        *init_inputs: Any,
	        **kwargs: Any,
	    ) -> "NucEL_Tokenizer":
	        """Load a tokenizer from a local directory or a Hub repository.

	        If ``pretrained_model_name_or_path`` is not an existing directory it is
	        treated as a Hub repo id and the tokenizer files are downloaded with
	        ``snapshot_download``. Settings come from ``tokenizer_config.json`` and
	        the vocabulary from ``vocab.json``; keyword arguments passed by the
	        caller override the values stored in the config. ``init_inputs`` is
	        accepted for signature compatibility and ignored.
	        """
	        # Download-only options are consumed here so they neither clash with
	        # nor get stored among the tokenizer's own init arguments.
	        hub_kwargs = {}
	        for name in ("cache_dir", "revision", "token", "local_files_only", "force_download"):
	            value = kwargs.pop(name, None)
	            if value is not None:
	                hub_kwargs[name] = value

	        path = Path(pretrained_model_name_or_path)
	        if not path.is_dir():
	            from huggingface_hub import snapshot_download

	            local_dir = snapshot_download(
	                repo_id=str(pretrained_model_name_or_path),
	                allow_patterns=[
	                    "tokenizer_config.json",
	                    "vocab.json",
	                    "special_tokens_map.json",
	                    "tokenizer.py",
	                ],
	                **hub_kwargs,
	            )
	            path = Path(local_dir)

	        with open(path / "tokenizer_config.json", "r", encoding="utf-8") as f:
	            config = json.load(f)
	        with open(path / "vocab.json", "r", encoding="utf-8") as f:
	            vocab = json.load(f)

	        init_kwargs = {
	            "k": config.get("k", 1),
	            "model_max_length": config.get("model_max_length", 2048),
	            "num_reserved_tokens": config.get("num_reserved_tokens", 16),
	        }
	        special_defaults = {
	            "pad_token": "[PAD]",
	            "unk_token": "[UNK]",
	            "sep_token": "[SEP]",
	            "cls_token": "[CLS]",
	            "mask_token": "[MASK]",
	            "bos_token": "[BOS]",
	            "eos_token": "[EOS]",
	        }
	        for name, default in special_defaults.items():
	            init_kwargs[name] = cls._token_text(config.get(name, default), default)
	        # Explicit caller arguments win over the stored config; merging first
	        # avoids passing the same keyword twice.
	        init_kwargs.update(kwargs)

	        tokenizer = cls(**init_kwargs)
	        # The stored vocabulary replaces the one generated by __init__.
	        tokenizer.vocab = vocab
	        tokenizer.ids_to_tokens = {idx: token for token, idx in vocab.items()}
	        return tokenizer