Skip to content

Commit 04d9ced

Browse files
authored
Merge pull request #1 from audiohacking/audio-reference
Reference audio (WAV/MP3), CI build + test-generation
2 parents 9493b5c + c6640c0 commit 04d9ced

18 files changed

Lines changed: 2653 additions & 22 deletions

.github/workflows/build.yml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Validate that the project builds on Ubuntu and macOS (no model download).
2+
name: Build
3+
4+
on:
5+
push:
6+
branches: [main, master]
7+
pull_request:
8+
branches: [main, master]
9+
10+
jobs:
11+
build:
12+
runs-on: ${{ matrix.os }}
13+
strategy:
14+
fail-fast: false
15+
matrix:
16+
os: [ubuntu-latest, macos-latest]
17+
18+
steps:
19+
- name: Checkout
20+
uses: actions/checkout@v4
21+
with:
22+
submodules: recursive
23+
24+
- name: Build (Ubuntu)
25+
if: matrix.os == 'ubuntu-latest'
26+
run: |
27+
sudo apt-get update -qq
28+
sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev
29+
mkdir build && cd build
30+
cmake .. -DGGML_BLAS=ON
31+
cmake --build . --config Release -j$(nproc)
32+
33+
- name: Build (macOS)
34+
if: matrix.os == 'macos-latest'
35+
run: |
36+
mkdir build && cd build
37+
cmake ..
38+
cmake --build . --config Release -j$(sysctl -n hw.ncpu)
39+
40+
- name: Smoke test
41+
run: |
42+
./build/ace-qwen3 --help 2>&1 | head -5
43+
./build/dit-vae --help 2>&1 | head -5
44+
./build/quantize --help 2>&1 | head -3
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Build, download models (cached), and run short generation tests for various modes.
2+
# Runs on release (published) or manual trigger only. Uses short duration (5s) and few steps (4).
3+
name: Test generation
4+
5+
on:
6+
workflow_dispatch: {}
7+
release:
8+
types: [published]
9+
10+
jobs:
11+
build-and-test:
12+
runs-on: ubuntu-latest
13+
timeout-minutes: 60
14+
15+
steps:
16+
- name: Checkout
17+
uses: actions/checkout@v4
18+
with:
19+
submodules: recursive
20+
21+
- name: Build
22+
run: |
23+
sudo apt-get update -qq
24+
sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev
25+
mkdir build && cd build
26+
cmake .. -DGGML_BLAS=ON
27+
cmake --build . --config Release -j$(nproc)
28+
29+
- name: Cache models
30+
id: cache-models
31+
uses: actions/cache@v4
32+
with:
33+
path: models
34+
key: acestep-models-q8-${{ hashFiles('models.sh') }}
35+
restore-keys: acestep-models-q8-
36+
37+
- name: Download models
38+
if: steps.cache-models.outputs.cache-hit != 'true'
39+
run: |
40+
pip install -q hf
41+
./models.sh
42+
43+
- name: Test mode text2music (short)
44+
run: |
45+
./build/dit-vae \
46+
--request tests/fixtures/ci-text2music.json \
47+
--text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
48+
--dit models/acestep-v15-turbo-Q8_0.gguf \
49+
--vae models/vae-BF16.gguf
50+
test -f tests/fixtures/ci-text2music0.wav && echo "text2music WAV OK"
51+
52+
- name: Test mode cover with WAV reference (short)
53+
run: |
54+
./build/dit-vae \
55+
--request tests/fixtures/ci-cover.json \
56+
--text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
57+
--dit models/acestep-v15-turbo-Q8_0.gguf \
58+
--vae models/vae-BF16.gguf
59+
test -f tests/fixtures/ci-cover0.wav && echo "cover WAV OK"
60+
61+
- name: Test full pipeline (LLM + DiT, short)
62+
run: |
63+
./build/ace-qwen3 \
64+
--request tests/fixtures/ci-text2music.json \
65+
--model models/acestep-5Hz-lm-4B-Q8_0.gguf
66+
test -f request0.json
67+
./build/dit-vae \
68+
--request request0.json \
69+
--text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
70+
--dit models/acestep-v15-turbo-Q8_0.gguf \
71+
--vae models/vae-BF16.gguf
72+
test -f request00.wav && echo "full pipeline WAV OK"

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ build/
33
*.bf16
44

55
tests/*/
6+
!tests/fixtures/
7+
!tests/fixtures/*.json
68

79
checkpoints/
810
models/

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ macro(link_ggml_backends target)
4646
endmacro()
4747

4848
# dit-vae: full pipeline (text-enc + cond + dit + vae + wav)
49-
add_executable(dit-vae dit-vae.cpp request.cpp)
49+
add_executable(dit-vae dit-vae.cpp request.cpp audio_loader.cpp)
5050
link_ggml_backends(dit-vae)
5151

5252
# ace-qwen3: LLM inference (CoT + audio codes)

README.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ cmake --build . --config Release -j$(nproc)
3131

3232
Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE).
3333

34+
**CI (GitHub Actions)**
35+
- **Build**: on every push or pull request targeting `main`/`master`, builds on Ubuntu (BLAS) and macOS (Metal); a smoke test runs each binary with `--help`.
36+
- **Test generation**: on manual trigger or when a release is published; builds, caches models, then runs short (5 s, 4 steps) generation for text2music, cover, and full pipeline (LLM → DiT → WAV). See `.github/workflows/`.
37+
3438
## Models
3539

3640
Pre-quantized GGUFs on [Hugging Face](https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF).
@@ -139,10 +143,11 @@ cd examples
139143
./partial.sh # caption + lyrics + duration
140144
./full.sh # all metadata provided
141145
./dit-only.sh # skip LLM, DiT from noise
146+
./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength
142147
```
143148

144149
Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0)
145-
alongside the turbo default (8 steps, no CFG).
150+
alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights).
146151

147152
## Generation modes
148153

@@ -170,10 +175,11 @@ Run `dit-vae` to decode existing codes. See `examples/dit-only.json`.
170175

171176
## Request JSON reference
172177

173-
All fields with defaults. Only `caption` is required.
178+
All fields with defaults. Only `caption` is required. Built-in modes (text2music, cover, repaint) and audio inputs follow the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md); see [docs/MODES.md](docs/MODES.md) for what is implemented.
174179

175180
```json
176181
{
182+
"task_type": "text2music",
177183
"caption": "",
178184
"lyrics": "",
179185
"instrumental": false,
@@ -188,7 +194,12 @@ All fields with defaults. Only `caption` is required.
188194
"lm_top_p": 0.9,
189195
"lm_top_k": 0,
190196
"lm_negative_prompt": "",
197+
"reference_audio": "",
198+
"src_audio": "",
191199
"audio_codes": "",
200+
"audio_cover_strength": 1.0,
201+
"repainting_start": 0.0,
202+
"repainting_end": 0.0,
192203
"inference_steps": 8,
193204
"guidance_scale": 7.0,
194205
"shift": 3.0
@@ -198,7 +209,7 @@ All fields with defaults. Only `caption` is required.
198209
Key fields: `seed` -1 means random (resolved once, then +1 per batch
199210
element). `audio_codes` is generated by ace-qwen3 and consumed by
200211
dit-vae (comma separated FSQ token IDs). When present, the LLM is
201-
skipped entirely.
212+
skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: not yet implemented (see docs/MODES.md).
202213

203214
Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG).
204215
SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`.

audio.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// audio.h: unified reference-audio loader (WAV + MP3 → stereo 48kHz float)
2+
// Header-only for WAV; MP3 implementation in audio_loader.cpp (minimp3, no temp files).
3+
4+
#pragma once
5+
6+
#include <cstddef>
7+
#include <string>
8+
#include <vector>
9+
10+
// Load WAV or MP3 file into stereo float32 at 48kHz.
11+
// Out: interleaved L,R,L,R,...; out->size() == 2 * num_samples (num_samples counts samples per channel).
12+
// Returns num_samples (per channel), or -1 on error.
13+
// No temp files; MP3 decoded in memory via minimp3 (header-only dep).
14+
int load_audio_48k_stereo(const char * path, std::vector<float> * out);
15+
16+
// MP3 implementation (in audio_loader.cpp; do not call from other TUs without linking it)
17+
int mp3_load_48k_stereo(const char * path, std::vector<float> * out);

audio_loader.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
// audio_loader.cpp: MP3 decode for reference audio (minimp3, no deps, no temp files)
2+
3+
#define MINIMP3_IMPLEMENTATION
4+
#include "third_party/minimp3.h"
5+
6+
#include "wav.h"
7+
#include "audio.h"
8+
#include <cstdio>
9+
#include <cstring>
10+
#include <vector>
11+
#include <algorithm>
12+
13+
// Case-insensitive (ASCII letters only) test that `path` ends with `suffix`.
// Returns false when `path` is shorter than `suffix`; an empty suffix always matches.
static bool path_ends_with_ci(const char * path, const char * suffix) {
    const size_t path_len   = strlen(path);
    const size_t suffix_len = strlen(suffix);
    if (suffix_len > path_len) {
        return false;
    }

    // Fold 'A'..'Z' onto 'a'..'z' (offset 32); every other byte is compared as-is.
    const auto ascii_lower = [](char c) -> char {
        return (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
    };

    const char * tail = path + (path_len - suffix_len);
    size_t matched = 0;
    while (matched < suffix_len && ascii_lower(tail[matched]) == ascii_lower(suffix[matched])) {
        ++matched;
    }
    return matched == suffix_len;
}
24+
25+
// Convert 16-bit PCM into interleaved stereo float and, when the input rate is
// not 48000 Hz, resample it to 48 kHz.
//   pcm          input samples; mono, or interleaved pairs when channels != 1
//   num_samples  number of frames (samples per channel) in pcm
//   channels     1 duplicates each sample into L and R; any other value reads
//                2 * num_samples interleaved values directly
//   sample_rate  input rate in Hz
//   out          receives interleaved L,R floats, each scaled by 1/32768
// Resampling is linear interpolation between neighbouring input frames.
static void pcm_to_float_stereo_48k(
    const int16_t * pcm, size_t num_samples, int channels, unsigned int sample_rate,
    std::vector<float> * out)
{
    const float inv_full_scale = 1.0f / 32768.0f;
    std::vector<float> & dst = *out;
    dst.resize(num_samples * 2);

    if (channels != 1) {
        const size_t total = num_samples * 2;
        for (size_t k = 0; k < total; ++k) {
            dst[k] = (float)pcm[k] * inv_full_scale;
        }
    } else {
        for (size_t n = 0; n < num_samples; ++n) {
            const float v = (float)pcm[n] * inv_full_scale;
            dst[n * 2]     = v;
            dst[n * 2 + 1] = v;
        }
    }

    if (sample_rate == 48000) {
        return;
    }

    const size_t src_frames = num_samples;
    const size_t dst_frames = (size_t)((double)src_frames * 48000.0 / (double)sample_rate);
    std::vector<float> stretched(dst_frames * 2);
    for (size_t n = 0; n < dst_frames; ++n) {
        // Fractional position of output frame n inside the input.
        const double pos  = (double)n * (double)src_frames / (double)dst_frames;
        const size_t lo   = (size_t)pos;
        const size_t hi   = std::min(lo + 1, src_frames - 1);  // clamp at the last input frame
        const float  frac = (float)(pos - (double)lo);
        for (int ch = 0; ch < 2; ++ch) {
            stretched[n * 2 + ch] = dst[lo * 2 + ch] * (1.0f - frac) + dst[hi * 2 + ch] * frac;
        }
    }
    dst = std::move(stretched);
}
57+
58+
int mp3_load_48k_stereo(const char * path, std::vector<float> * out) {
59+
FILE * f = fopen(path, "rb");
60+
if (!f) return -1;
61+
fseek(f, 0, SEEK_END);
62+
long sz = ftell(f);
63+
fseek(f, 0, SEEK_SET);
64+
if (sz <= 0 || sz > 200 * 1024 * 1024) {
65+
fclose(f);
66+
return -1;
67+
}
68+
std::vector<uint8_t> buf((size_t)sz);
69+
if (fread(buf.data(), 1, (size_t)sz, f) != (size_t)sz) {
70+
fclose(f);
71+
return -1;
72+
}
73+
fclose(f);
74+
75+
mp3dec_t dec;
76+
mp3dec_init(&dec);
77+
mp3dec_frame_info_t info;
78+
std::vector<int16_t> pcm;
79+
const uint8_t * read_pos = buf.data();
80+
int remaining = (int)buf.size();
81+
int first_hz = 0, first_ch = 0;
82+
const size_t max_samples = (size_t)(60 * 48000 * 2);
83+
84+
while (remaining > 0) {
85+
size_t old_size = pcm.size();
86+
if (old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME > max_samples) break;
87+
pcm.resize(old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME);
88+
int frame_samples = mp3dec_decode_frame(&dec, read_pos, remaining, pcm.data() + old_size, &info);
89+
if (frame_samples <= 0) {
90+
pcm.resize(old_size);
91+
read_pos++;
92+
remaining--;
93+
continue;
94+
}
95+
if (first_hz == 0) {
96+
first_hz = info.hz;
97+
first_ch = info.channels;
98+
}
99+
pcm.resize(old_size + (size_t)(frame_samples * info.channels));
100+
read_pos += info.frame_bytes;
101+
remaining -= info.frame_bytes;
102+
}
103+
104+
if (pcm.empty() || first_hz == 0) return -1;
105+
size_t num_samples = pcm.size() / (size_t)first_ch;
106+
pcm_to_float_stereo_48k(pcm.data(), num_samples, first_ch, (unsigned)first_hz, out);
107+
return (int)(out->size() / 2);
108+
}
109+
110+
int load_audio_48k_stereo(const char * path, std::vector<float> * out) {
111+
if (!path || !out) return -1;
112+
if (path_ends_with_ci(path, ".mp3"))
113+
return mp3_load_48k_stereo(path, out);
114+
if (path_ends_with_ci(path, ".wav"))
115+
return wav_load_48k_stereo(path, out);
116+
return -1;
117+
}

0 commit comments

Comments
 (0)