package tokenizer

import (
	"encoding/json"
	"fmt"
	"os"
	"strings"
)

// JSON structures for the HuggingFace tokenizer.json format.

// addedTokenJSON is one entry of the top-level "added_tokens" array.
type addedTokenJSON struct {
	ID      int    `json:"id"`
	Content string `json:"content"`
	Special bool   `json:"special"`
}

// modelJSON is the "model" object of tokenizer.json.
type modelJSON struct {
	Type   string          `json:"type"`
	Vocab  map[string]int  `json:"vocab"`
	Merges json.RawMessage `json:"merges"` // Can be []string or [][]string
}

// tokenizerJSON holds the parts of tokenizer.json that this loader reads.
type tokenizerJSON struct {
	AddedTokens []addedTokenJSON `json:"added_tokens"`
	Model       modelJSON        `json:"model"`
}

// LoadFromJSON loads a tokenizer from a file in HuggingFace tokenizer.json
// format.
//
// It reads the whole file at path and hands the bytes to LoadFromBytes. It
// returns the read error unchanged if the file cannot be read, or whatever
// error LoadFromBytes reports.
func LoadFromJSON(path string) (*Tokenizer, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		// Returned unwrapped on purpose: the os error already names the
		// path, and callers may inspect it directly (e.g. os.IsNotExist).
		return nil, err
	}
	return LoadFromBytes(data)
}

// LoadFromBytes loads a tokenizer from raw tokenizer.json data.
//
// It returns an error if data is not valid JSON for the expected structure,
// or if "model.merges" is present but is neither a []string nor a
// [][]string. On error the returned *Tokenizer is nil.
func LoadFromBytes(data []byte) (*Tokenizer, error) {
	var tJSON tokenizerJSON
	if err := json.Unmarshal(data, &tJSON); err != nil {
		return nil, fmt.Errorf("parsing tokenizer JSON: %w", err)
	}

	// Merges can be []string or [][]string. A value in any other shape is
	// reported instead of silently yielding a tokenizer without merges.
	merges, err := decodeMerges(tJSON.Model.Merges)
	if err != nil {
		return nil, fmt.Errorf("parsing tokenizer merges: %w", err)
	}

	byteEnc, byteDec := buildByteEncoder()

	// The decoded vocab map is stored as-is (not copied). The other maps are
	// pre-sized because their final sizes are known at this point.
	t := &Tokenizer{
		vocab:       tJSON.Model.Vocab,
		idToToken:   make(map[int]string, len(tJSON.Model.Vocab)+len(tJSON.AddedTokens)),
		merges:      make(map[string]int, len(merges)),
		addedTokens: make(map[string]int, len(tJSON.AddedTokens)),
		byteEncoder: byteEnc,
		byteDecoder: byteDec,
	}

	// Build id -> token map.
	for tok, id := range t.vocab {
		t.idToToken[id] = tok
	}

	// Merges are keyed in "a b" form; the slice index is the priority
	// (lower = higher priority).
	for i, m := range merges {
		t.merges[m] = i
	}

	// Added tokens (special). They are registered after the vocab, so an
	// added token replaces any vocab entry that uses the same id.
	for _, at := range tJSON.AddedTokens {
		t.addedTokens[at.Content] = at.ID
		t.idToToken[at.ID] = at.Content
		// EOS is detected by substring; when several added tokens match,
		// the last one in the list wins.
		if strings.Contains(at.Content, "endoftext") || strings.Contains(at.Content, "im_end") {
			t.eosID = at.ID
		}
	}

	// Compile the pre-tokenizer pattern chosen from the model type.
	t.prePattern = CompilePattern(DetectPattern(tJSON.Model.Type))

	return t, nil
}

// parseMerges handles both []string and [][]string merge formats.
//
// It returns nil when raw is empty or cannot be decoded in either format.
// Use decodeMerges when the decode failure itself must be reported.
func parseMerges(raw json.RawMessage) []string {
	merges, err := decodeMerges(raw)
	if err != nil {
		return nil
	}
	return merges
}

// decodeMerges decodes the "merges" field of a tokenizer model.
//
// Two encodings are accepted: a list of strings, returned unchanged, and a
// list of string pairs, where each pair is joined with a single space.
// Entries of the pair form that do not have exactly two elements are
// skipped. An empty raw yields (nil, nil). If raw matches neither encoding,
// the error from the pair-form decode attempt is returned, wrapped.
func decodeMerges(raw json.RawMessage) ([]string, error) {
	if len(raw) == 0 {
		return nil, nil
	}

	// Try []string first.
	var merges []string
	if err := json.Unmarshal(raw, &merges); err == nil {
		return merges, nil
	}

	// Fall back to [][]string.
	var mergePairs [][]string
	if err := json.Unmarshal(raw, &mergePairs); err != nil {
		return nil, fmt.Errorf("expected []string or [][]string: %w", err)
	}

	merges = make([]string, 0, len(mergePairs))
	for _, pair := range mergePairs {
		if len(pair) == 2 {
			merges = append(merges, pair[0]+" "+pair[1])
		}
	}
	return merges, nil
}