diff --git a/bark.cpp b/bark.cpp
index 79c6cc2..288a299 100644
--- a/bark.cpp
+++ b/bark.cpp
@@ -1,3 +1,21 @@
+/*
+Port of Suno's Bark to C/C++.
+
+Author: Pierre-Antoine Bannier
+
+Note on tokenization
+--------------------
+Even if bark relies on GPT to generate semantic tokens, the tokenizer is based on
+Bert's multilingual cased tokenizer. This uses the WordPiece algorithm to split raw text
+into tokens.
+
+This file contains an unofficial (Google has not released an official implementation of
+WordPiece) implementation of WordPiece.
+
+Source:
+https://github.com/skeskinen/bert.cpp/blob/master/bert.cpp
+
+*/
 #include "bark.h"
 #include "ggml.h"
 #include "util.h"
@@ -8,9 +26,14 @@
 #include
 #include
 #include
+#include <regex>
 #include
 
-bool gpt_model_load(const std::string& fname, gpt_model & model) {
+// Load the GPT-style model stored in the ggml file `fname` into `model`.
+// When `has_vocab` is true, the file also carries the WordPiece vocabulary
+// right after the hyper-parameters; it is read into `vocab`.
+// Returns false on failure (e.g. unreadable file, vocabulary size mismatch).
+bool gpt_model_load(const std::string& fname, gpt_model& model, bark_vocab& vocab, bool has_vocab) {
     auto fin = std::ifstream(fname, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
@@ -50,9 +73,49 @@ bool gpt_model_load(const std::string& fname, gpt_model & model) {
         printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes);
     }
 
-    // TODO: load vocab
-    {
 
+    if (has_vocab) {
+        int32_t n_vocab = 0;
+        read_safe(fin, n_vocab);
+
+        // 5 special tokens: [UNK, SEP, MASK, PAD, CLS]
+        const int32_t n_expected = model.hparams.n_in_vocab - model.hparams.n_out_vocab - 5;
+        if (!fin || n_vocab != n_expected) {
+            fprintf(stderr, "%s: wrong vocabulary size (%d != %d)\n", __func__, n_vocab, n_expected);
+            return false;
+        }
+
+        std::string word;
+        std::vector<char> tmp;
+
+        tmp.reserve(128);
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len = 0;
+            read_safe(fin, len);
+
+            if (len > 0) {
+                tmp.resize(len);
+                fin.read(&tmp[0], tmp.size()); // read to buffer
+                word.assign(&tmp[0], tmp.size());
+            } else {
+                word = "";
+            }
+
+            if (!fin) {
+                fprintf(stderr, "%s: unexpected end of file while reading vocab\n", __func__);
+                return false;
+            }
+
+            // WordPiece marks word continuations with a "##" prefix; index them
+            // (without the prefix) separately so the tokenizer can match suffixes.
+            if (word.size() > 2 && word.compare(0, 2, "##") == 0) {
+                vocab.subword_token_to_id[word.substr(2)] = i;
+                vocab.id_to_subword_token[i] = word;
+            }
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+        }
     }
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
@@ -301,7 +364,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
     {
         printf("%s: reading bark text model\n", __func__);
         const std::string fname = dirname + "/ggml_weights_text.bin";
-        if(!gpt_model_load(fname, model.text_model)) {
+        if(!gpt_model_load(fname, model.text_model, model.vocab, true)) {
             fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str());
             return false;
         }
@@ -312,7 +375,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
     {
         printf("\n%s: reading bark coarse model\n", __func__);
         const std::string fname = dirname + "/ggml_weights_coarse.bin";
-        if(!gpt_model_load(fname, model.coarse_model)) {
+        if(!gpt_model_load(fname, model.coarse_model, model.vocab, false)) {
             fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str());
             return false;
         }
@@ -323,7 +386,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
     {
         printf("\n%s: reading bark fine model\n", __func__);
         const std::string fname = dirname + "/ggml_weights_fine.bin";
-        if(!gpt_model_load(fname, model.fine_model)) {
+        if(!gpt_model_load(fname, model.fine_model, model.vocab, false)) {
             fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str());
             return false;
         }
@@ -346,6 +409,112 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
     return true;
 }
 
+// Normalize a prompt before tokenization: accented letters known to
+// strip_accents() become plain ASCII letters, then ASCII upper-case letters are
+// lower-cased. All other bytes are left untouched.
+std::string bert_normalize_prompt(const std::string &text) {
+    std::string text2 = strip_accents(text);
+    for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
+        const char c = text2[i];
+        if (c >= 'A' && c <= 'Z')
+            text2[i] = c - 'A' + 'a';
+    }
+    return text2;
+}
+
+// Tokenize `text` with greedy longest-match-first WordPiece.
+//
+// The prompt is normalized, split into words (runs of letters, runs of digits,
+// or single punctuation characters), and every word is then covered left to
+// right by the longest vocabulary entry: the first piece is looked up in
+// `vocab.token_to_id`, the following ones in `vocab.subword_token_to_id`.
+// Bytes that cannot be matched are reported on stderr and skipped.
+//
+//   tokens       - output buffer with room for `n_max_tokens` ids
+//   n_tokens     - receives the number of ids written, CLS/SEP included
+//   n_max_tokens - capacity of `tokens`; must be >= 2 to hold CLS and SEP
+void bert_tokenize(
+    const bark_vocab& vocab,
+    const char * text,
+    int32_t * tokens,
+    int32_t * n_tokens,
+    int32_t n_max_tokens) {
+    if (n_tokens == nullptr)
+        return;
+    *n_tokens = 0;
+    if (text == nullptr || tokens == nullptr || n_max_tokens < 2)
+        return;
+
+    std::vector<std::string> words;
+
+    // first split the text into words
+    {
+        const std::string str = bert_normalize_prompt(text);
+
+        const std::regex re(R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)");
+
+        // iterate over the matches in place instead of re-copying the suffix
+        for (std::sregex_iterator it(str.begin(), str.end(), re), end; it != end; ++it)
+            words.push_back(it->str());
+    }
+
+    int32_t t = 0;
+    tokens[t++] = CLS_TOKEN_ID;
+
+    // find the longest tokens that form the words:
+    for (const auto &word : words) {
+        // always keep one slot free for the trailing SEP token
+        if (t >= n_max_tokens - 1)
+            break;
+
+        const int n = word.size();
+        int i = 0;
+        auto *token_map = &vocab.token_to_id;
+
+        while (i < n && t < n_max_tokens - 1) {
+            bool found = false;
+            for (int j = n; j > i; --j) {
+                const auto it = token_map->find(word.substr(i, j - i));
+                if (it != token_map->end()) {
+                    tokens[t++] = it->second;
+                    i = j;
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).c_str());
+                ++i;
+            }
+            // everything after the first position is a word continuation
+            token_map = &vocab.subword_token_to_id;
+        }
+    }
+    tokens[t++] = SEP_TOKEN_ID;
+    *n_tokens = t;
+}
+
+// Run the text-to-audio pipeline for `text`. Only the first stage is
+// implemented so far: the prompt is tokenized and the token ids are printed.
+void bark_generate_audio(const bark_model& model, const bark_vocab& vocab, const char * text) {
+    // tokenize text (bert tokenizer)
+    const int32_t max_ctx_size = model.text_model.hparams.block_size;
+    int32_t n_tokens = 0;
+
+    std::vector<int32_t> tokens(max_ctx_size);
+    bert_tokenize(vocab, text, tokens.data(), &n_tokens, max_ctx_size);
+
+    printf("\ntokens: ");
+    for (int i = 0; i < n_tokens; i++) {
+        printf("%d ", tokens[i]);
+    }
+    printf("\n");
+
+    // encode text
+
+    // generate audio (encodec)
+}
+
 int main(int argc, char **argv) {
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -372,6 +541,9 @@ int main(int argc, char **argv) {
     {
         const int64_t t_eval_us_start = ggml_time_us();
 
+        // call to generate audio
+        bark_generate_audio(model, model.vocab, prompt.data());
+
         t_eval_us = ggml_time_us() - t_eval_us_start;
     }
 
diff --git a/bark.h b/bark.h
index e99b9eb..5b72699 100644
--- a/bark.h
+++ b/bark.h
@@ -3,6 +3,10 @@
 #include
 #include
 
+// ids written by bert_tokenize() at the start (CLS) and end (SEP) of every prompt
+#define CLS_TOKEN_ID 101
+#define SEP_TOKEN_ID 102
+
 struct gpt_hparams {
     int32_t n_in_vocab;
     int32_t n_out_vocab;
@@ -14,15 +18,20 @@ struct gpt_hparams {
     int32_t n_wtes;
 };
 
-struct gpt_vocab {
+// WordPiece vocabulary used by the text tokenizer (filled by gpt_model_load).
+struct bark_vocab {
     using id = int32_t;
     using token = std::string;
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
-    std::vector<std::string> special_tokens;
 
-    void add_special_token(const std::string & token);
+    // word-continuation pieces ("##..." entries), keyed without the "##" prefix
+    std::map<token, id> subword_token_to_id;
+    std::map<id, token> id_to_subword_token;
+
+    // std::vector<std::string> special_tokens;
+    // void add_special_token(const std::string & token);
 };
 
 struct gpt_layer {
@@ -85,5 +94,8 @@ struct bark_model {
     // decoder
     encodec_model codec_model;
 
+    // vocab
+    bark_vocab vocab;
+
     int32_t memsize = 0;
 };
\ No newline at end of file
diff --git a/convert_pt_to_ggml.py b/convert_pt_to_ggml.py
index c0967e6..4077456 100644
--- a/convert_pt_to_ggml.py
+++ b/convert_pt_to_ggml.py
@@ -26,21 +26,23 @@
 python convert_pt_to_ggml.py \
     --dir-model ~/.cache/suno/bark_v0 \
     --codec-path ~/Documents/encodec.cpp/ggml_weights \
+    --vocab-path ./ggml_weights/ \
     --out-dir ./ggml_weights/
 ```
 """
 import argparse
 from pathlib import Path
-import json
 import re
 import struct
 
 import numpy as np
 import torch
 
+
 parser = argparse.ArgumentParser()
 parser.add_argument("--dir-model", type=str, required=True)
 parser.add_argument("--codec-path", type=str, required=True)
+parser.add_argument("--vocab-path", type=str, required=True)
 parser.add_argument("--out-dir", type=str, required=True)
 
 
@@ -103,18 +105,23 @@ def parse_codec_model(checkpoint, out_dir):
     outfile.close()
 
 def parse_vocab(dir_model, outfile):
-    """Parse GPT vocabulary."""
-    with open(dir_model / "vocab.json", "r", encoding="utf-8") as infile:
-        vocab = json.load(infile)
+    """Write the WordPiece vocabulary stored in `dir_model / "vocab.txt"`.
+
+    Layout: the token count, then for each token (one per line of the file,
+    in file order) its byte length followed by its UTF-8 bytes. Counts and
+    lengths are packed with struct format "i".
+    """
+    # Even if bark relies on GPT to encode text, it relies on BertTokenizer (WordPiece)
+    with open(dir_model / "vocab.txt", "r", encoding="utf-8") as f:
+        vocab = f.readlines()
 
-    tokens = sorted(vocab.items(), key=lambda x: x[1])
-    outfile.write(struct.pack("i", len(tokens)))
-    print("Vocab size:", len(tokens))
+    outfile.write(struct.pack("i", len(vocab)))
+    print("Vocab size:", len(vocab))
 
-    for token, _ in tokens:
-        text = bytearray(token, "utf-8")
-        outfile.write(struct.pack("i", len(text)))
-        outfile.write(text)
+    for token in vocab:
+        data = bytearray(token.rstrip("\n"), "utf-8")  # strip the newline only (the last line may lack one)
+        outfile.write(struct.pack("i", len(data)))
+        outfile.write(data)
 
 def parse_hparams(hparams, outfile):
     """Parse GPT hyperparameters."""
@@ -227,12 +234,14 @@ def parse_text_models(checkpoint, outfile):
         var_data.tofile(outfile)
 
 
-def generate_file(in_file, out_dir):
+def generate_file(in_file, out_dir, vocab_path=None):
     outfile = open(out_dir, "wb")
     outfile.write(struct.pack("i", 0x67676d6c))  # ggml magic
-
     checkpoint = torch.load(in_file, map_location="cpu")
+
     parse_hparams(checkpoint["model_args"], outfile)
+    if vocab_path:
+        parse_vocab(vocab_path, outfile)
     parse_text_models(checkpoint["model"], outfile)
 
     outfile.close()
@@ -243,11 +252,12 @@ def generate_file(in_file, out_dir):
 
     dir_model = Path(args.dir_model)
     codec_path = Path(args.codec_path)
+    vocab_path = Path(args.vocab_path)
     out_dir = Path(args.out_dir)
 
     out_dir.mkdir(exist_ok=True, parents=True)
 
-    generate_file(dir_model / "text_2.pt", out_dir / "ggml_weights_text.bin")
+    generate_file(dir_model / "text_2.pt", out_dir / "ggml_weights_text.bin", vocab_path)
     print(" Text model loaded.")
 
     generate_file(dir_model / "coarse_2.pt", out_dir / "ggml_weights_coarse.bin")
diff --git a/util.h b/util.h
index 7205230..7a589dc 100644
--- a/util.h
+++ b/util.h
@@ -16,3 +16,52 @@ template<typename T>
 static void read_safe(std::ifstream& infile, T& dest) {
     infile.read((char*)& dest, sizeof(T));
 }
+
+// Length in bytes of the UTF-8 sequence introduced by lead byte `src`,
+// derived from its four high bits. Continuation bytes (0b10xxxxxx) map to 1,
+// so a scan that lands on one still advances.
+static size_t utf8_len(char src) {
+    const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
+// Replace the accented Latin letters listed below with their unaccented ASCII
+// letter; every other character is copied through unchanged.
+//
+// `static` like the other helpers in this header: a non-inline external
+// definition would clash as soon as two translation units include util.h.
+static std::string strip_accents(const std::string &in_str) {
+    // built once on first use instead of on every call
+    static const std::map<std::string, char> accent_map = {
+        {"À", 'A'},{"Á", 'A'},{"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},
+        {"à", 'a'},{"á", 'a'},{"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},
+        {"È", 'E'},{"É", 'E'},{"Ê", 'E'},{"Ë", 'E'},
+        {"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'},
+        {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},
+        {"ì", 'i'},{"í", 'i'},{"î", 'i'},{"ï", 'i'},
+        {"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'},{"Ö", 'O'},
+        {"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'},
+        {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},
+        {"ù", 'u'},{"ú", 'u'},{"û", 'u'},{"ü", 'u'},
+        {"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'},
+        {"Ñ", 'N'},{"ñ", 'n'},
+    };
+
+    std::string out_str;
+    out_str.reserve(in_str.size());
+
+    for (size_t i = 0; i < in_str.length();) {
+        const size_t len = utf8_len(in_str[i]);
+        const std::string cur = in_str.substr(i, len);
+        const auto iter = accent_map.find(cur);
+        if (iter != accent_map.end())
+            out_str += iter->second;
+        else
+            out_str += cur;
+
+        i += len;
+    }
+
+    return out_str;
+}
\ No newline at end of file