PABannier · PABannier · Jul 9, 2023 · Jul 9, 2023
diff --git a/bark.cpp b/bark.cpp
@@ -1,3 +1,21 @@
+/*
+Port of Suno's Bark to C/C++.
+
+Author: Pierre-Antoine Bannier<pierreantoine.bannier@gmail.com>
+
+Note on tokenization
+--------------------
+Although Bark relies on a GPT model to generate semantic tokens, its tokenizer is based
+on BERT's multilingual cased tokenizer, which uses the WordPiece algorithm to split raw
+text into tokens.
+
+This file contains an unofficial implementation of WordPiece (Google has not released
+an official one).
+
+Source:
+https://github.com/skeskinen/bert.cpp/blob/master/bert.cpp
+
+*/
 #include "bark.h"
 #include "ggml.h"
 #include "util.h"
@@ -8,9 +26,10 @@
 #include <cstring>
 #include <fstream>
 #include <map>
+#include <regex>
 #include <string>
 
-bool gpt_model_load(const std::string& fname, gpt_model & model) {
+bool gpt_model_load(const std::string& fname, gpt_model& model, bark_vocab& vocab, bool has_vocab) {
     auto fin = std::ifstream(fname, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
@@ -50,9 +69,36 @@ bool gpt_model_load(const std::string& fname, gpt_model & model) {
         printf("%s: n_wtes      = %d\n", __func__, hparams.n_wtes);
     }
 
-    // TODO: load vocab
-    {
+    if (has_vocab) {
+        int32_t n_vocab;
+        read_safe(fin, n_vocab);
+
+        // 5 special tokens: [UNK, SEP, MASK, PAD, CLS]
+        if (n_vocab != model.hparams.n_in_vocab - model.hparams.n_out_vocab - 5) {
+            fprintf(stderr, "%s: wrong voculary size (%d != %d)\n", __func__, n_vocab, model.hparams.n_in_vocab);
+            return false;
+        }
+
+        std::string word;
+        std::vector<char> tmp;
+
+        tmp.reserve(128);
 
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len;
+            read_safe(fin, len);
+
+            if (len > 0) {
+                tmp.resize(len);
+                fin.read(&tmp[0], tmp.size()); // read to buffer
+                word.assign(&tmp[0], tmp.size());
+            } else {
+                word = "";
+            }
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+        }
     }
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
@@ -301,7 +347,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
     {
         printf("%s: reading bark text model\n", __func__);
         const std::string fname = dirname + "/ggml_weights_text.bin";
-        if(!gpt_model_load(fname, model.text_model)) {
+        if(!gpt_model_load(fname, model.text_model, model.vocab, true)) {
             fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str());
             return false;
         }
@@ -312,7 +358,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
     {
         printf("\n%s: reading bark coarse model\n", __func__);
         const std::string fname = dirname + "/ggml_weights_coarse.bin";
-        if(!gpt_model_load(fname, model.coarse_model)) {
+        if(!gpt_model_load(fname, model.coarse_model, model.vocab, false)) {
             fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str());
             return false;
         }
@@ -323,7 +369,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
     {
         printf("\n%s: reading bark fine model\n", __func__);
         const std::string fname = dirname + "/ggml_weights_fine.bin";
-        if(!gpt_model_load(fname, model.fine_model)) {
+        if(!gpt_model_load(fname, model.fine_model, model.vocab, false)) {
             fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str());
             return false;
         }
@@ -346,6 +392,99 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
     return true;
 }
 
+// Normalizes a prompt before tokenization: accents are removed with
+// strip_accents() and ASCII letters 'A'..'Z' are converted to lower case.
+// The scan advances by utf8_len() of each byte it visits, so bytes inside a
+// multi-byte sequence are skipped rather than inspected.
+std::string bert_normalize_prompt(const std::string &text) {
+    std::string normalized = strip_accents(text);
+
+    size_t pos = 0;
+    while (pos < normalized.size()) {
+        const char ch = normalized[pos];
+        const bool is_ascii_upper = ('A' <= ch) && (ch <= 'Z');
+        if (is_ascii_upper) {
+            normalized[pos] = ch - 'A' + 'a';
+        }
+        // An ASCII letter has length 1 before and after lower-casing, so the
+        // step can be computed from the original byte.
+        pos += utf8_len(ch);
+    }
+
+    return normalized;
+}
+
+// Tokenizes `text` with a greedy, longest-match-first WordPiece scheme.
+//
+// The prompt is normalized with bert_normalize_prompt(), split into words
+// (a single punctuation character, a run of letters, or a run of digits), and
+// each word is then covered left to right by the longest vocabulary entries
+// that match. The first piece of a word is looked up in vocab.token_to_id;
+// every later piece is looked up in vocab.subword_token_to_id. A position at
+// which nothing matches is reported on stderr and skipped by one byte.
+//
+// Parameters:
+//   vocab         lookup tables used for matching
+//   text          NUL-terminated prompt; a null pointer is treated as empty
+//   tokens        output buffer with room for at least n_max_tokens ids
+//   n_tokens      receives the number of ids written
+//   n_max_tokens  capacity of `tokens`; must be >= 2 (CLS and SEP are always
+//                 written). Otherwise nothing is written and *n_tokens is 0.
+//
+// Output layout: CLS_TOKEN_ID, word pieces..., SEP_TOKEN_ID. Pieces that do
+// not fit in the buffer are silently dropped.
+//
+// NOTE(review): the vocab loading code in this change only fills token_to_id
+// and id_to_token; confirm that subword_token_to_id is populated somewhere,
+// otherwise pieces after the first one of a word can never match.
+// NOTE(review): the character classes are matched byte-wise by std::regex, so
+// non-ASCII bytes remaining after normalization are presumably not matched
+// and therefore dropped in the default locale -- confirm.
+void bert_tokenize(
+    const bark_vocab& vocab,
+    const char * text,
+    int32_t * tokens,
+    int32_t * n_tokens,
+    int32_t n_max_tokens) {
+    // CLS and SEP are written unconditionally, so a smaller buffer would be overrun.
+    if (n_max_tokens < 2) {
+        fprintf(stderr, "%s: token buffer too small (%d < 2)\n", __func__, n_max_tokens);
+        *n_tokens = 0;
+        return;
+    }
+
+    // first split the text into words
+    const std::string str = bert_normalize_prompt(text ? text : "");
+
+    // Compiled once: constructing a std::regex is expensive.
+    static const std::regex re(R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)");
+
+    std::vector<std::string> words;
+    for (std::sregex_iterator it(str.begin(), str.end(), re), last; it != last; ++it) {
+        words.push_back(it->str());
+    }
+
+    int32_t t = 0;
+    tokens[t++] = CLS_TOKEN_ID;
+
+    // find the longest tokens that form the words:
+    for (const auto &word : words) {
+        // One slot is always kept free for the trailing SEP token.
+        if (t >= n_max_tokens - 1)
+            break;
+
+        const size_t n = word.size();
+        size_t i = 0;
+        const auto *token_map = &vocab.token_to_id;
+
+        while (i < n && t < n_max_tokens - 1) {
+            // Try the longest candidate first and shrink it from the right.
+            size_t j = n;
+            bool found = false;
+            for (; j > i; --j) {
+                const auto it = token_map->find(word.substr(i, j - i));
+                if (it != token_map->end()) {
+                    tokens[t++] = it->second;
+                    found = true;
+                    break;
+                }
+            }
+
+            if (found) {
+                i = j;
+            } else {
+                fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).c_str());
+                ++i;
+            }
+
+            // Everything after the first position of a word is a continuation piece.
+            token_map = &vocab.subword_token_to_id;
+        }
+    }
+    tokens[t++] = SEP_TOKEN_ID;
+    *n_tokens = t;
+}
+
+// Entry point of the text-to-audio pipeline. Only the first stage is present
+// so far: the prompt is tokenized with bert_tokenize() and the resulting ids
+// are printed to stdout. The encoding and audio generation stages are
+// placeholders.
+//
+// NOTE(review): `model` is taken by value, so every call copies the
+// bark_model object; consider passing it by const reference.
+void bark_generate_audio(bark_model model, const bark_vocab& vocab, const char * text) {
+    // tokenize text (bert tokenizer)
+    // The token buffer is sized to the text model's block_size.
+    const int32_t ctx_len = model.text_model.hparams.block_size;
+
+    std::vector<bark_vocab::id> token_ids(ctx_len);
+    int32_t token_count;
+    bert_tokenize(vocab, text, token_ids.data(), &token_count, ctx_len);
+
+    printf("\ntokens: ");
+    for (int k = 0; k < token_count; ++k) {
+        printf("%d ", token_ids[k]);
+    }
+    printf("\n");
+
+    // encode text
+
+    // generate audio (encodec)
+}
+
 int main(int argc, char **argv) {
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -372,6 +511,9 @@ int main(int argc, char **argv) {
     {
         const int64_t t_eval_us_start = ggml_time_us();
 
+        // call to generate audio
+        bark_generate_audio(model, model.vocab, prompt.data());
+
         t_eval_us = ggml_time_us() - t_eval_us_start;
     }
 

diff --git a/bark.h b/bark.h
@@ -3,6 +3,9 @@
 #include <map>
 #include <vector>
 
+#define CLS_TOKEN_ID 101
+#define SEP_TOKEN_ID 102
+
 struct gpt_hparams {
     int32_t n_in_vocab;
     int32_t n_out_vocab;
@@ -14,15 +17,18 @@ struct gpt_hparams {
     int32_t n_wtes;
 };
 
-struct gpt_vocab {
+// Vocabulary tables used by the WordPiece tokenizer (bert_tokenize).
+struct bark_vocab {
     using id    = int32_t;
     using token = std::string;
 
+    // Token text -> id and id -> token text; filled while loading the text model file.
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
-    std::vector<std::string> special_tokens;
 
-    void add_special_token(const std::string & token);
+    // Table consulted by bert_tokenize for the pieces of a word that follow its
+    // first position, and the corresponding id -> token table.
+    std::map<token, id> subword_token_to_id;
+    std::map<id, token> id_to_subword_token;
+
+    // std::vector<std::string> special_tokens;
+    // void add_special_token(const std::string & token);
 };
 
 struct gpt_layer {
@@ -85,5 +91,8 @@ struct bark_model {
     // decoder
     encodec_model codec_model;
 
+    // vocab
+    bark_vocab vocab;
+
     int32_t memsize = 0;
 };
diff --git a/convert_pt_to_ggml.py b/convert_pt_to_ggml.py
@@ -26,21 +26,23 @@
     python convert_pt_to_ggml.py \
         --dir-model ~/.cache/suno/bark_v0 \
         --codec-path ~/Documents/encodec.cpp/ggml_weights \
+        --vocab-path ./ggml_weights/ \
         --out-dir ./ggml_weights/
 ```
 """
 import argparse
 from pathlib import Path
-import json
 import re
 import struct
 
 import numpy as np
 import torch
 
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--dir-model", type=str, required=True)
 parser.add_argument("--codec-path", type=str, required=True)
+parser.add_argument("--vocab-path", type=str, required=True)
 parser.add_argument("--out-dir", type=str, required=True)
 
 
@@ -103,18 +105,18 @@ def parse_codec_model(checkpoint, out_dir):
     outfile.close()
 
 def parse_vocab(dir_model, outfile):
-    """Parse GPT vocabulary."""
-    with open(dir_model / "vocab.json", "r", encoding="utf-8") as infile:
-        vocab = json.load(infile)
+    """Serialize the WordPiece vocabulary into ``outfile``.
+
+    Reads ``vocab.txt`` (one token per line) from ``dir_model`` and writes the
+    number of tokens followed, for each token in file order, by the length of
+    its UTF-8 encoding and the encoded bytes. Counts and lengths are packed
+    with ``struct`` format ``"i"``.
+
+    Args:
+        dir_model: directory containing ``vocab.txt``; must support the ``/``
+            operator (e.g. ``pathlib.Path``).
+        outfile: binary file object opened for writing.
+    """
+    # Even though bark relies on GPT to encode text, it tokenizes with BertTokenizer (WordPiece)
+    with open(dir_model / "vocab.txt", "r", encoding="utf-8") as f:
+        # Strip only the line terminator: slicing off the last character would
+        # truncate the final token when the file has no trailing newline.
+        vocab = [line.rstrip("\n") for line in f]
 
-    tokens = sorted(vocab.items(), key=lambda x: x[1])
-    outfile.write(struct.pack("i", len(tokens)))
-    print("Vocab size:", len(tokens))
+    outfile.write(struct.pack("i", len(vocab)))
+    print("Vocab size:", len(vocab))
 
-    for token, _ in tokens:
-        text = bytearray(token, "utf-8")
-        outfile.write(struct.pack("i", len(text)))
-        outfile.write(text)
+    for token in vocab:
+        data = token.encode("utf-8")
+        outfile.write(struct.pack("i", len(data)))
+        outfile.write(data)
 
 def parse_hparams(hparams, outfile):
     """Parse GPT hyperparameters."""
@@ -227,12 +229,14 @@ def parse_text_models(checkpoint, outfile):
 
         var_data.tofile(outfile)
 
-def generate_file(in_file, out_dir):
+def generate_file(in_file, out_dir, vocab_path=None):
     outfile = open(out_dir, "wb")
     outfile.write(struct.pack("i", 0x67676d6c))  # ggml magic
-
     checkpoint = torch.load(in_file, map_location="cpu")
+
     parse_hparams(checkpoint["model_args"], outfile)
+    if vocab_path:
+        parse_vocab(vocab_path, outfile)
     parse_text_models(checkpoint["model"], outfile)
 
     outfile.close()
@@ -243,11 +247,12 @@ def generate_file(in_file, out_dir):
 
     dir_model = Path(args.dir_model)
     codec_path = Path(args.codec_path)
+    vocab_path = Path(args.vocab_path)
 
     out_dir = Path(args.out_dir)
     out_dir.mkdir(exist_ok=True, parents=True)
 
-    generate_file(dir_model / "text_2.pt", out_dir / "ggml_weights_text.bin")
+    generate_file(dir_model / "text_2.pt", out_dir / "ggml_weights_text.bin", vocab_path)
     print(" Text model loaded.")
 
     generate_file(dir_model / "coarse_2.pt", out_dir / "ggml_weights_coarse.bin")

diff --git a/util.h b/util.h
@@ -16,3 +16,38 @@ template<typename T>
 static void read_safe(std::ifstream& infile, T& dest) {
     infile.read((char*)& dest, sizeof(T));
 }
+
+// Returns the number of bytes in the UTF-8 sequence introduced by `src`,
+// decided from the four most significant bits of the byte:
+//   0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
+// Continuation bytes (top bits 10) therefore also yield 1.
+static size_t utf8_len(char src) {
+    const unsigned char lead = static_cast<unsigned char>(src);
+    if (lead < 0xC0) {
+        return 1;
+    }
+    if (lead < 0xE0) {
+        return 2;
+    }
+    if (lead < 0xF0) {
+        return 3;
+    }
+    return 4;
+}
+
+// Returns a copy of `in_str` in which every accented letter listed in the
+// table below is replaced by its unaccented ASCII letter. All other
+// characters are copied unchanged. The input is walked one character at a
+// time, using utf8_len() on the lead byte to find character boundaries.
+//
+// Declared `inline` because this definition lives in a header: without it,
+// including the header from a second translation unit produces duplicate
+// symbol errors at link time.
+//
+// NOTE(review): the table keys are string literals, so matching assumes this
+// header and the input are both UTF-8 encoded -- confirm.
+inline std::string strip_accents(const std::string &in_str) {
+    // Built once on first use instead of on every call.
+    static const std::map<std::string, char> accent_map = {{"À", 'A'},{"Á", 'A'},
+        {"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},{"à", 'a'},{"á", 'a'},
+        {"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},{"È", 'E'},{"É", 'E'},
+        {"Ê", 'E'},{"Ë", 'E'},{"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'},
+        {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},{"ì", 'i'},{"í", 'i'},
+        {"î", 'i'},{"ï", 'i'},{"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'},
+        {"Ö", 'O'},{"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'},
+        {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},{"ù", 'u'},{"ú", 'u'},
+        {"û", 'u'},{"ü", 'u'},{"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'},
+        {"Ñ", 'N'},{"ñ", 'n'},
+    };
+
+    std::string out_str;
+    // The output is never longer than the input.
+    out_str.reserve(in_str.size());
+
+    for (size_t i = 0; i < in_str.length();) {
+        const size_t len = utf8_len(in_str[i]);
+        // substr clamps at the end of the string, so a truncated trailing
+        // sequence is copied as-is.
+        const std::string cur = in_str.substr(i, len);
+        const auto iter = accent_map.find(cur);
+        if (iter != accent_map.end())
+            out_str += iter->second;
+        else
+            out_str += cur;
+
+        i += len;
+    }
+
+    return out_str;
+}