package tokenizer

import (
	"encoding/json"
	"os"
	"strings"
)

// JSON structures for HuggingFace tokenizer.json format.

type addedTokenJSON struct {
	ID      int    `json:"id"`
	Content string `json:"content"`
	Special bool   `json:"special"`
}

type modelJSON struct {
	Type   string          `json:"type"`
	Vocab  map[string]int  `json:"vocab"`
	Merges json.RawMessage `json:"merges"` // can be []string or [][]string
}

type tokenizerJSON struct {
	AddedTokens []addedTokenJSON `json:"added_tokens"`
	Model       modelJSON        `json:"model"`
}
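
// For orientation, a minimal tokenizer.json that these structs decode looks
// like the following (an illustrative sketch, not taken from any real model):
//
//	{
//	  "added_tokens": [{"id": 3, "content": "<|endoftext|>", "special": true}],
//	  "model": {
//	    "type": "BPE",
//	    "vocab": {"h": 0, "i": 1, "hi": 2},
//	    "merges": ["h i"]
//	  }
//	}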

// LoadFromJSON loads a tokenizer from HuggingFace tokenizer.json format.
func LoadFromJSON(path string) (*Tokenizer, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	return LoadFromBytes(data)
}

// LoadFromBytes loads a tokenizer from raw JSON data.
func LoadFromBytes(data []byte) (*Tokenizer, error) {
	var tJSON tokenizerJSON
	if err := json.Unmarshal(data, &tJSON); err != nil {
		return nil, err
	}

	// Parse merges - can be []string or [][]string.
	merges := parseMerges(tJSON.Model.Merges)
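
	// Byte-level BPE represents every raw byte as a printable unicode rune
	// (GPT-2 style); byteEnc/byteDec are assumed to hold that mapping in both
	// directions, built by buildByteEncoder elsewhere in this package.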
	byteEnc, byteDec := buildByteEncoder()

	t := &Tokenizer{
		vocab:       tJSON.Model.Vocab,
		idToToken:   make(map[int]string),
		merges:      make(map[string]int),
		addedTokens: make(map[string]int),
		byteEncoder: byteEnc,
		byteDecoder: byteDec,
	}

	// Build the id -> token reverse map.
	for k, v := range t.vocab {
		t.idToToken[v] = k
	}

	// Parse merges: "a b" format, index = priority (lower = higher priority).
	for i, m := range merges {
		t.merges[m] = i
	}

	// Register added (special) tokens; the EOS id is detected heuristically
	// from well-known token names.
	for _, at := range tJSON.AddedTokens {
		t.addedTokens[at.Content] = at.ID
		t.idToToken[at.ID] = at.Content
		if strings.Contains(at.Content, "endoftext") || strings.Contains(at.Content, "im_end") {
			t.eosID = at.ID
		}
	}

	// Compile the pre-tokenizer pattern based on the tokenizer model type.
	t.prePattern = CompilePattern(DetectPattern(tJSON.Model.Type))

	return t, nil
}
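
// A minimal usage sketch. Encode and Decode are assumed to be implemented
// elsewhere in this package; those method names are illustrative only:
//
//	t, err := LoadFromJSON("tokenizer.json")
//	if err != nil {
//		log.Fatal(err)
//	}
//	ids := t.Encode("hello world") // assumed method
//	text := t.Decode(ids)          // assumed method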

// parseMerges handles both the legacy []string and the newer [][]string
// merge formats.
func parseMerges(raw json.RawMessage) []string {
	if len(raw) == 0 {
		return nil
	}

	// Try []string first.
	var merges []string
	if err := json.Unmarshal(raw, &merges); err == nil {
		return merges
	}

	// Fall back to [][]string, joining each pair as "a b".
	var mergePairs [][]string
	if err := json.Unmarshal(raw, &mergePairs); err == nil {
		merges = make([]string, 0, len(mergePairs))
		for _, pair := range mergePairs {
			if len(pair) == 2 {
				merges = append(merges, pair[0]+" "+pair[1])
			}
		}
		return merges
	}

	return nil
}
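
// For example, both of these JSON fragments yield the single merge "h i":
//
//	"merges": ["h i"]
//	"merges": [["h", "i"]]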