Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 148 additions & 6 deletions bark.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
/*
Port of Suno's Bark to C/C++.

Author: Pierre-Antoine Bannier<pierreantoine.bannier@gmail.com>

Note on tokenization
--------------------
Although bark relies on GPT to generate semantic tokens, its tokenizer is based on
Bert's multilingual cased tokenizer, which uses the WordPiece algorithm to split raw
text into tokens.

This file contains an unofficial implementation of WordPiece (Google has not released
an official one).

Source:
https://github.com/skeskinen/bert.cpp/blob/master/bert.cpp

*/
#include "bark.h"
#include "ggml.h"
#include "util.h"
Expand All @@ -8,9 +26,10 @@
#include <cstring>
#include <fstream>
#include <map>
#include <regex>
#include <string>

bool gpt_model_load(const std::string& fname, gpt_model & model) {
bool gpt_model_load(const std::string& fname, gpt_model& model, bark_vocab& vocab, bool has_vocab) {
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
Expand Down Expand Up @@ -50,9 +69,36 @@ bool gpt_model_load(const std::string& fname, gpt_model & model) {
printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes);
}

// TODO: load vocab
{
if (has_vocab) {
int32_t n_vocab;
read_safe(fin, n_vocab);

// 5 special tokens: [UNK, SEP, MASK, PAD, CLS]
if (n_vocab != model.hparams.n_in_vocab - model.hparams.n_out_vocab - 5) {
fprintf(stderr, "%s: wrong voculary size (%d != %d)\n", __func__, n_vocab, model.hparams.n_in_vocab);
return false;
}

std::string word;
std::vector<char> tmp;

tmp.reserve(128);

for (int i = 0; i < n_vocab; i++) {
uint32_t len;
read_safe(fin, len);

if (len > 0) {
tmp.resize(len);
fin.read(&tmp[0], tmp.size()); // read to buffer
word.assign(&tmp[0], tmp.size());
} else {
word = "";
}

vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}

// for the big tensors, we have the option to store the data in 16-bit floats or quantized
Expand Down Expand Up @@ -301,7 +347,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
{
printf("%s: reading bark text model\n", __func__);
const std::string fname = dirname + "/ggml_weights_text.bin";
if(!gpt_model_load(fname, model.text_model)) {
if(!gpt_model_load(fname, model.text_model, model.vocab, true)) {
fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str());
return false;
}
Expand All @@ -312,7 +358,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
{
printf("\n%s: reading bark coarse model\n", __func__);
const std::string fname = dirname + "/ggml_weights_coarse.bin";
if(!gpt_model_load(fname, model.coarse_model)) {
if(!gpt_model_load(fname, model.coarse_model, model.vocab, false)) {
fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str());
return false;
}
Expand All @@ -323,7 +369,7 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
{
printf("\n%s: reading bark fine model\n", __func__);
const std::string fname = dirname + "/ggml_weights_fine.bin";
if(!gpt_model_load(fname, model.fine_model)) {
if(!gpt_model_load(fname, model.fine_model, model.vocab, false)) {
fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str());
return false;
}
Expand All @@ -346,6 +392,99 @@ bool bark_model_load(std::string & dirname, bark_model & model) {
return true;
}

// Normalize a prompt before tokenization: accents are removed via
// strip_accents() and ASCII upper-case letters are folded to lower case.
// The string is walked one UTF-8 sequence at a time (utf8_len), so only the
// first byte of each sequence is ever examined or rewritten.
std::string bert_normalize_prompt(const std::string &text) {
    std::string normalized = strip_accents(text);

    size_t pos = 0;
    while (pos < normalized.size()) {
        const char ch = normalized[pos];
        // 'A'..'Z' are single-byte sequences, so the step is the same
        // whether it is computed before or after the case fold.
        const size_t step = utf8_len(ch);
        if ('A' <= ch && ch <= 'Z') {
            normalized[pos] = ch - 'A' + 'a';
        }
        pos += step;
    }

    return normalized;
}

// Tokenize `text` with a greedy, longest-match-first WordPiece scheme.
//
// The prompt is normalized with bert_normalize_prompt(), split into words
// (a single punctuation character, a run of letters, or a run of digits), and
// each word is then covered left to right by the longest vocabulary entries
// that match. The first piece of a word is looked up in vocab.token_to_id,
// every following piece in vocab.subword_token_to_id. A position with no
// matching entry is reported on stderr and skipped by one byte.
//
// Parameters:
//   vocab         vocabulary used for the look-ups
//   text          NUL-terminated input prompt
//   tokens        output buffer with room for n_max_tokens ids
//   n_tokens      receives the number of ids written (including CLS and SEP)
//   n_max_tokens  capacity of `tokens`
//
// The output always starts with CLS_TOKEN_ID and ends with SEP_TOKEN_ID; one
// slot is kept free for the trailing SEP, so tokenization stops early once
// n_max_tokens - 1 ids have been produced. If `text` or `tokens` is null, or
// the buffer cannot hold both CLS and SEP (n_max_tokens < 2), nothing is
// written and *n_tokens is set to 0. A null `n_tokens` makes the call a no-op.
//
// NOTE(review): the model loader visible in this change only fills
// token_to_id / id_to_token; confirm that subword_token_to_id is populated
// somewhere, otherwise continuation pieces can never match.
void bert_tokenize(
        const bark_vocab& vocab,
        const char * text,
        int32_t * tokens,
        int32_t * n_tokens,
        int32_t n_max_tokens) {
    if (n_tokens == nullptr) {
        return;
    }
    *n_tokens = 0;

    // CLS and SEP are written unconditionally below, so both must fit.
    if (text == nullptr || tokens == nullptr || n_max_tokens < 2) {
        return;
    }

    // first split the text into words
    std::vector<std::string> words;
    {
        const std::string normalized = bert_normalize_prompt(text);
        const std::regex re(R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)");

        // Every alternative consumes at least one character, so iterating the
        // matches is equivalent to repeatedly searching the remaining suffix,
        // without re-copying the rest of the string after each match.
        const std::sregex_iterator last;
        for (std::sregex_iterator m(normalized.begin(), normalized.end(), re); m != last; ++m) {
            words.push_back(m->str());
        }
    }

    // index of the slot reserved for the closing SEP token
    const int32_t sep_slot = n_max_tokens - 1;

    int32_t t = 0;
    tokens[t++] = CLS_TOKEN_ID;

    // find the longest tokens that form the words:
    for (const auto & word : words) {
        if (t >= sep_slot) {
            break; // buffer full: no later word could add a token
        }

        const int n = static_cast<int>(word.size());
        int i = 0;

        // the first piece of a word uses the whole-word table, later pieces
        // use the subword table
        const auto * token_map = &vocab.token_to_id;

        while (i < n && t < sep_slot) {
            bool matched = false;

            // try the longest candidate first and shrink from the right
            for (int j = n; j > i; --j) {
                const auto it = token_map->find(word.substr(i, j - i));
                if (it != token_map->end()) {
                    tokens[t++] = it->second;
                    i = j;
                    matched = true;
                    break;
                }
            }

            if (!matched) {
                fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).c_str());
                ++i;
            }

            token_map = &vocab.subword_token_to_id;
        }
    }

    tokens[t++] = SEP_TOKEN_ID;
    *n_tokens = t;
}

// Entry point of the text-to-audio pipeline. At this stage only the first
// step is implemented: the prompt is tokenized with bert_tokenize() and the
// resulting ids are printed to stdout. Text encoding and audio generation are
// still placeholders.
//
// NOTE(review): `model` is taken by value, so every call copies the whole
// bark_model; a const reference would avoid the copy but changes the
// signature - confirm against any declaration elsewhere before changing.
void bark_generate_audio(bark_model model, const bark_vocab& vocab, const char * text) {
    // tokenize text (bert tokenizer)
    // the token buffer is sized from the text model's block_size
    const int32_t ctx_len = model.text_model.hparams.block_size;

    std::vector<bark_vocab::id> token_ids(ctx_len);
    int32_t n_tokens;
    bert_tokenize(vocab, text, token_ids.data(), &n_tokens, ctx_len);

    printf("\ntokens: ");
    int idx = 0;
    while (idx < n_tokens) {
        printf("%d ", token_ids[idx]);
        ++idx;
    }
    printf("\n");

    // encode text

    // generate audio (encodec)
}

int main(int argc, char **argv) {
const int64_t t_main_start_us = ggml_time_us();

Expand All @@ -372,6 +511,9 @@ int main(int argc, char **argv) {
{
const int64_t t_eval_us_start = ggml_time_us();

// call to generate audio
bark_generate_audio(model, model.vocab, prompt.data());

t_eval_us = ggml_time_us() - t_eval_us_start;
}

Expand Down
15 changes: 12 additions & 3 deletions bark.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
#include <map>
#include <vector>

#define CLS_TOKEN_ID 101
#define SEP_TOKEN_ID 102

struct gpt_hparams {
int32_t n_in_vocab;
int32_t n_out_vocab;
Expand All @@ -14,15 +17,18 @@ struct gpt_hparams {
int32_t n_wtes;
};

struct gpt_vocab {
// Vocabulary used by the WordPiece-style tokenizer (see bert_tokenize).
// It keeps two pairs of look-up tables: one pair for the piece that starts a
// word and one pair for the pieces that continue a word.
struct bark_vocab {
// numeric token id
using id = int32_t;
// token text
using token = std::string;

// token text -> id; consulted for the first piece of each word
std::map<token, id> token_to_id;
// id -> token text (reverse of token_to_id)
std::map<id, token> id_to_token;
// NOTE(review): this member and add_special_token() below also appear
// commented out at the end of the struct; they look like removed lines left
// over from the diff view - confirm which form is intended.
std::vector<std::string> special_tokens;

void add_special_token(const std::string & token);
// token text -> id; consulted for every piece after the first one of a word
std::map<token, id> subword_token_to_id;
// id -> token text for subword pieces
// presumably the reverse of subword_token_to_id - TODO confirm where it is filled
std::map<id, token> id_to_subword_token;

// std::vector<std::string> special_tokens;
// void add_special_token(const std::string & token);
};

struct gpt_layer {
Expand Down Expand Up @@ -85,5 +91,8 @@ struct bark_model {
// decoder
encodec_model codec_model;

// vocab
bark_vocab vocab;

int32_t memsize = 0;
};
33 changes: 19 additions & 14 deletions convert_pt_to_ggml.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,23 @@
python convert_pt_to_ggml.py \
--dir-model ~/.cache/suno/bark_v0 \
--codec-path ~/Documents/encodec.cpp/ggml_weights \
--vocab-path ./ggml_weights/ \
--out-dir ./ggml_weights/
```
"""
import argparse
from pathlib import Path
import json
import re
import struct

import numpy as np
import torch


# Command-line interface: all four paths are mandatory.
parser = argparse.ArgumentParser()
# Directory holding the PyTorch checkpoints (text_2.pt, coarse_2.pt, ...).
parser.add_argument("--dir-model", type=str, required=True)
# Path used for the codec model weights.
# NOTE(review): how it is consumed is not visible in this part of the script - confirm.
parser.add_argument("--codec-path", type=str, required=True)
# Directory containing the tokenizer's vocab.txt (read by parse_vocab).
parser.add_argument("--vocab-path", type=str, required=True)
# Output directory for the generated ggml_weights_*.bin files; created if missing.
parser.add_argument("--out-dir", type=str, required=True)


Expand Down Expand Up @@ -103,18 +105,18 @@ def parse_codec_model(checkpoint, out_dir):
outfile.close()

def parse_vocab(dir_model, outfile):
    """Serialize the WordPiece vocabulary into ``outfile``.

    Even though bark relies on GPT to encode text, its tokenizer is a
    BertTokenizer (WordPiece), so the vocabulary is read from ``vocab.txt``
    (one token per line) rather than from a GPT-style ``vocab.json``.

    Binary layout written to ``outfile``:
      * int32: number of tokens
      * for each token, in file order: int32 byte length, then the UTF-8 bytes

    Args:
        dir_model: ``pathlib.Path`` of the directory containing ``vocab.txt``.
        outfile: binary file object opened for writing.
    """
    with open(dir_model / "vocab.txt", "r", encoding="utf-8") as f:
        vocab = f.readlines()

    outfile.write(struct.pack("i", len(vocab)))
    print("Vocab size:", len(vocab))

    for token in vocab:
        # Drop only the line terminator. Slicing off the last character
        # unconditionally would eat a real character when the final line of
        # the file has no trailing newline.
        data = token.rstrip("\n").encode("utf-8")
        outfile.write(struct.pack("i", len(data)))
        outfile.write(data)

def parse_hparams(hparams, outfile):
"""Parse GPT hyperparameters."""
Expand Down Expand Up @@ -227,12 +229,14 @@ def parse_text_models(checkpoint, outfile):

var_data.tofile(outfile)

def generate_file(in_file, out_dir, vocab_path=None):
    """Convert one PyTorch checkpoint into a ggml weights file.

    The output starts with the ggml magic number, followed by the
    hyperparameters, optionally the vocabulary, and finally the model tensors.

    Args:
        in_file: path of the PyTorch checkpoint to load.
        out_dir: path of the output file to write (despite the name, this is
            a file path, not a directory).
        vocab_path: optional directory containing ``vocab.txt``; when given,
            the vocabulary is written right after the hyperparameters.
    """
    # The context manager guarantees the output file is closed even if loading
    # the checkpoint or one of the parse steps raises.
    with open(out_dir, "wb") as outfile:
        outfile.write(struct.pack("i", 0x67676d6c))  # ggml magic

        checkpoint = torch.load(in_file, map_location="cpu")

        parse_hparams(checkpoint["model_args"], outfile)
        if vocab_path:
            parse_vocab(vocab_path, outfile)
        parse_text_models(checkpoint["model"], outfile)
Expand All @@ -243,11 +247,12 @@ def generate_file(in_file, out_dir):

dir_model = Path(args.dir_model)
codec_path = Path(args.codec_path)
vocab_path = Path(args.vocab_path)

out_dir = Path(args.out_dir)
out_dir.mkdir(exist_ok=True, parents=True)

generate_file(dir_model / "text_2.pt", out_dir / "ggml_weights_text.bin")
generate_file(dir_model / "text_2.pt", out_dir / "ggml_weights_text.bin", vocab_path)
print(" Text model loaded.")

generate_file(dir_model / "coarse_2.pt", out_dir / "ggml_weights_coarse.bin")
Expand Down
35 changes: 35 additions & 0 deletions util.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,38 @@ template<typename T>
static void read_safe(std::ifstream& infile, T& dest) {
infile.read((char*)& dest, sizeof(T));
}

// Length in bytes of the UTF-8 sequence introduced by lead byte `src`,
// decided from its high bits: 110x -> 2, 1110 -> 3, 1111 -> 4.
// Everything else (ASCII and continuation bytes 10xx) yields 1.
static size_t utf8_len(char src) {
    const uint8_t lead = static_cast<uint8_t>(src);

    if (lead < 0xC0) {
        return 1; // 0xxxxxxx or 10xxxxxx
    }
    if (lead < 0xE0) {
        return 2; // 110xxxxx
    }
    if (lead < 0xF0) {
        return 3; // 1110xxxx
    }
    return 4;     // 1111xxxx
}

// Replace a fixed set of accented Latin letters (A/E/I/O/U/Y/C/N variants,
// both cases) with their unaccented ASCII counterpart. Every other UTF-8
// sequence is copied through unchanged.
//
// The input is walked one UTF-8 sequence at a time using utf8_len(); a
// truncated sequence at the end of the string is copied as-is.
//
// Declared `inline` because the definition lives in a header: without it,
// including this header from more than one translation unit produces
// multiple-definition link errors.
inline std::string strip_accents(const std::string &in_str) {
    // Built once on first use instead of on every call.
    static const std::map<std::string, char> accent_map = {{"À", 'A'},{"Á", 'A'},
        {"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},{"à", 'a'},{"á", 'a'},
        {"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},{"È", 'E'},{"É", 'E'},
        {"Ê", 'E'},{"Ë", 'E'},{"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'},
        {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},{"ì", 'i'},{"í", 'i'},
        {"î", 'i'},{"ï", 'i'},{"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'},
        {"Ö", 'O'},{"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'},
        {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},{"ù", 'u'},{"ú", 'u'},
        {"û", 'u'},{"ü", 'u'},{"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'},
        {"Ñ", 'N'},{"ñ", 'n'},
    };

    std::string out_str;
    out_str.reserve(in_str.size()); // the output is never longer than the input

    for (size_t i = 0; i < in_str.length();) {
        const size_t len = utf8_len(in_str[i]);
        // substr clamps at the end of the string, so a truncated trailing
        // sequence is handled safely
        const std::string cur = in_str.substr(i, len);

        const auto iter = accent_map.find(cur);
        if (iter != accent_map.end()) {
            out_str += iter->second;
        } else {
            out_str += cur;
        }

        i += len;
    }

    return out_str;
}