Feature Extraction
Transformers
PyTorch
Safetensors
English
modernbert
genomics
nucleotide
dna
sequence-modeling
biology
bioinformatics
electra
Instructions to use FreakingPotato/NucEL with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use FreakingPotato/NucEL with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="FreakingPotato/NucEL")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("FreakingPotato/NucEL")
model = AutoModel.from_pretrained("FreakingPotato/NucEL")
```
- Notebooks
- Google Colab
- Kaggle
| """NucEL k-mer tokenizer. | |
| This is the single source of truth for the NucEL tokenizer. It is identical to | |
| the ``tokenizer.py`` shipped with ``FreakingPotato/NucEL`` on the Hugging Face | |
| Hub. The vocabulary follows the layout reported in the paper (Section 3.3): | |
| 7 special tokens + 4 nucleotides + 4**k k-mers + 16 reserved tokens | |
| For the published checkpoint, ``k = 1`` so the vocabulary size is:: | |
| 7 + 4 + 4 + 16 = 31 (vocab indices 0..30, model ``vocab_size`` is 27 + | |
| padding to the next embedding row). | |
| The discriminator's ``vocab_size`` in ``config.json`` is 27; the 4 trailing | |
| reserved slots exist on the tokenizer side but are not addressed by the model | |
| weights. They are reserved for future extensions to the alphabet. | |
| """ | |
| from itertools import product | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional, Tuple, Union | |
| import json | |
| import os | |
| from transformers import PreTrainedTokenizer | |
class NucEL_Tokenizer(PreTrainedTokenizer):
    """k-mer tokenizer for DNA sequences used by NucEL.

    The vocabulary is laid out, in order, as: the 7 special tokens, the 4
    nucleotides (A, C, G, T), all ``4**k`` k-mers over those nucleotides, and
    ``num_reserved_tokens`` placeholders named ``[RESERVED_i]``.

    Ids are assigned by position in that list.  When ``k == 1`` the 1-mers are
    the same strings as the nucleotides, so the token->id dict keeps only the
    later position for each of them (ids 11..14) and ids 7..10 stay
    unassigned.  With the default arguments this gives 27 distinct tokens
    whose ids range over 0..30.  This layout is kept as-is so that ids stay
    compatible with already-saved vocabularies.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        k: int = 1,
        model_max_length: int = 2048,
        pad_token: str = "[PAD]",
        unk_token: str = "[UNK]",
        sep_token: str = "[SEP]",
        cls_token: str = "[CLS]",
        mask_token: str = "[MASK]",
        bos_token: str = "[BOS]",
        eos_token: str = "[EOS]",
        num_reserved_tokens: int = 16,
        **kwargs: Any,
    ) -> None:
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            k: Length of the k-mers; must be at least 1.
            model_max_length: Forwarded to ``PreTrainedTokenizer``.
            pad_token, unk_token, sep_token, cls_token, mask_token,
            bos_token, eos_token: Text of the special tokens.
            num_reserved_tokens: Number of ``[RESERVED_i]`` placeholders
                appended to the vocabulary.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        # k == 0 would create an empty-string "k-mer" and make ``_tokenize``
        # loop forever (the cursor would advance by 0 characters).
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k!r}")

        self.k = k
        self.nucleotides = ["A", "C", "G", "T"]
        self.num_reserved_tokens = num_reserved_tokens
        self.special_tokens = {
            "pad_token": pad_token,
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "mask_token": mask_token,
            "bos_token": bos_token,
            "eos_token": eos_token,
        }
        # The vocabulary is built before the base-class initialiser runs,
        # which may already look tokens up.
        self._init_vocabulary()
        super().__init__(
            model_max_length=model_max_length,
            pad_token=pad_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            bos_token=bos_token,
            eos_token=eos_token,
            # Forwarded so the base class records them with the other init
            # arguments; ``from_pretrained`` reads ``k`` back from
            # ``tokenizer_config.json``.
            k=k,
            num_reserved_tokens=num_reserved_tokens,
            **kwargs,
        )

    def _init_vocabulary(self) -> None:
        """Create ``self.vocab`` (token -> id) and ``self.ids_to_tokens``."""
        special_tokens = [
            self.special_tokens["pad_token"],
            self.special_tokens["unk_token"],
            self.special_tokens["cls_token"],
            self.special_tokens["sep_token"],
            self.special_tokens["mask_token"],
            self.special_tokens["bos_token"],
            self.special_tokens["eos_token"],
        ]
        kmers = ["".join(p) for p in product(self.nucleotides, repeat=self.k)]
        reserved = [f"[RESERVED_{i}]" for i in range(self.num_reserved_tokens)]
        all_tokens = special_tokens + self.nucleotides + kmers + reserved
        # A token that appears twice in ``all_tokens`` (the nucleotides when
        # k == 1) keeps the id of its LAST occurrence.
        self.vocab = {token: idx for idx, token in enumerate(all_tokens)}
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}

    @property
    def vocab_size(self) -> int:
        """Number of distinct tokens in the vocabulary."""
        return len(self.vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the token -> id mapping."""
        return self.vocab.copy()

    def _tokenize(self, text: str) -> List[str]:
        """Split a DNA string into k-mer tokens, prefixed with the CLS token.

        The text is upper-cased and stripped.  At each position a full-length
        k-mer is consumed if it is in the vocabulary; otherwise a single
        character is consumed and emitted as a nucleotide token, or as the
        unknown token when it is not one of A/C/G/T.
        """
        text = text.upper().strip()
        tokens = [self.cls_token]
        length = len(text)
        i = 0
        while i < length:
            kmer = text[i : i + self.k]
            # Near the end of the text the slice is shorter than k; only a
            # full-length window counts as a k-mer.
            if len(kmer) == self.k and kmer in self.vocab:
                tokens.append(kmer)
                i += self.k
                continue
            nt = text[i]
            tokens.append(nt if nt in self.nucleotides else self.unk_token)
            i += 1
        return tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id, falling back to the unknown token's id."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its token, falling back to the unknown token."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """Write the vocabulary as JSON and return the written path.

        The file is named ``<filename_prefix>.json``, or ``vocab.json`` when
        no prefix is given.  ``save_directory`` is created if missing.
        """
        # ``save_pretrained`` calls this before delegating to the base class,
        # so the target directory may not exist yet.
        os.makedirs(save_directory, exist_ok=True)
        prefix = filename_prefix or "vocab"
        vocab_file = os.path.join(save_directory, f"{prefix}.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, ensure_ascii=False, indent=2)
        return (vocab_file,)

    def save_pretrained(
        self,
        save_directory: str,
        legacy_format: bool = True,
        filename_prefix: Optional[str] = None,
        **kwargs: Any,
    ):
        """Save the vocabulary, then the standard tokenizer files.

        Returns whatever the base-class ``save_pretrained`` returns, or the
        vocabulary file tuple if that result is falsy.
        """
        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
        return super().save_pretrained(
            save_directory,
            legacy_format=legacy_format,
            **kwargs,
        ) or vocab_files

    @staticmethod
    def _token_text(value: Any, default: str) -> str:
        """Return the text of a special token read from a tokenizer config.

        A plain string is returned unchanged.  A dict is treated as a
        serialised token and its ``"content"`` entry is used (some
        ``transformers`` versions write special tokens that way).  ``None``
        yields ``default``.
        """
        if isinstance(value, dict):
            return value.get("content", default)
        return default if value is None else value

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs: Any,
        **kwargs: Any,
    ) -> "NucEL_Tokenizer":
        """Load a tokenizer from a local directory or a Hugging Face Hub repo.

        If the argument is not an existing directory it is treated as a Hub
        repo id and the tokenizer files are downloaded first.  The tokenizer
        is built from ``tokenizer_config.json`` and its vocabulary is then
        replaced by the contents of ``vocab.json``.

        Args:
            pretrained_model_name_or_path: Local directory or Hub repo id.
            *init_inputs: Accepted for signature compatibility; unused.
            **kwargs: Extra constructor arguments.  They take precedence over
                values read from ``tokenizer_config.json``.
        """
        path = Path(pretrained_model_name_or_path)
        if not path.is_dir():
            # Imported lazily so that loading from a local directory does not
            # require ``huggingface_hub``.
            from huggingface_hub import snapshot_download

            local_dir = snapshot_download(
                repo_id=str(pretrained_model_name_or_path),
                allow_patterns=[
                    "tokenizer_config.json",
                    "vocab.json",
                    "special_tokens_map.json",
                    "tokenizer.py",
                ],
            )
            path = Path(local_dir)

        with open(path / "tokenizer_config.json", "r", encoding="utf-8") as f:
            config = json.load(f)
        with open(path / "vocab.json", "r", encoding="utf-8") as f:
            vocab = json.load(f)

        special_defaults = {
            "pad_token": "[PAD]",
            "unk_token": "[UNK]",
            "sep_token": "[SEP]",
            "cls_token": "[CLS]",
            "mask_token": "[MASK]",
            "bos_token": "[BOS]",
            "eos_token": "[EOS]",
        }
        init_kwargs: Dict[str, Any] = {
            "k": config.get("k", 1),
            "model_max_length": config.get("model_max_length", 2048),
            "num_reserved_tokens": config.get("num_reserved_tokens", 16),
        }
        for name, default in special_defaults.items():
            init_kwargs[name] = cls._token_text(config.get(name), default)
        # Merge instead of passing both sets of keywords, so a caller override
        # such as ``model_max_length=...`` replaces the config value rather
        # than raising a duplicate-keyword ``TypeError``.
        init_kwargs.update(kwargs)

        tokenizer = cls(**init_kwargs)
        # The saved vocabulary is authoritative over the generated one.
        tokenizer.vocab = vocab
        tokenizer.ids_to_tokens = {idx: token for token, idx in vocab.items()}
        return tokenizer