diff --git a/README.md b/README.md index c54d1f1..de8fd80 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,10 @@ # Wave-U-Net (Pytorch) + Improved version of the [Wave-U-Net](https://arxiv.org/abs/1806.03185) for audio source separation, implemented in Pytorch. Click [here](https://github.com/f90/Wave-U-Net) for the original Wave-U-Net implementation in Tensorflow. -You can find more information about the model and results there as well. +You can find more information about the model and results there as well. # Improvements @@ -24,7 +25,9 @@ GPU strongly recommended to avoid very long training times. System requirements: * Linux-based OS * Python 3.6 + * [libsndfile](http://mega-nerd.com/libsndfile/) + * [ffmpeg](https://www.ffmpeg.org/) * CUDA 10.1 for GPU usage @@ -68,6 +71,7 @@ You can of course use your own datasets for training, but for this you would nee # Training the models To train a Wave-U-Net, the basic command to use is + ``` python3.6 train.py --dataset_dir /PATH/TO/MUSDB18HQ ``` @@ -86,7 +90,7 @@ After training, the model is evaluated on the MUSDB18HQ test set, and SDR/SIR/SA # Test trained models on songs! -We provide the default model in a pre-trained form as download so you can separate your own songs right away. +We provide the default model in a pre-trained form as download so you can separate your own songs right away.
import argparse
import os
import tempfile
import zipfile
from pathlib import Path

import cog

import data.utils
import model.utils as model_utils
from model.waveunet import Waveunet
from test import predict_song


class waveunetPredictor(cog.Predictor):
    """Cog predictor that separates an audio mixture with a Wave-U-Net model."""

    @staticmethod
    def _build_arg_parser():
        """Return the argument parser holding the default model settings."""
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--instruments",
            type=str,
            nargs="+",
            default=["bass", "drums", "other", "vocals"],
            help='List of instruments to separate (default: "bass drums other vocals")',
        )
        parser.add_argument(
            "--cuda", action="store_true", help="Use CUDA (default: False)"
        )
        parser.add_argument(
            "--features",
            type=int,
            default=32,
            help="Number of feature channels per layer",
        )
        parser.add_argument(
            "--load_model",
            type=str,
            default="checkpoints/waveunet/model",
            help="Reload a previously trained model",
        )
        parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
        parser.add_argument(
            "--levels", type=int, default=6, help="Number of DS/US blocks"
        )
        parser.add_argument(
            "--depth", type=int, default=1, help="Number of convs per block"
        )
        parser.add_argument("--sr", type=int, default=44100, help="Sampling rate")
        parser.add_argument(
            "--channels", type=int, default=2, help="Number of input audio channels"
        )
        parser.add_argument(
            "--kernel_size",
            type=int,
            default=5,
            help="Filter width of kernels. Has to be an odd number",
        )
        parser.add_argument(
            "--output_size", type=float, default=2.0, help="Output duration"
        )
        parser.add_argument(
            "--strides", type=int, default=4, help="Strides in Waveunet"
        )
        parser.add_argument(
            "--conv_type",
            type=str,
            default="gn",
            help="Type of convolution (normal, BN-normalised, GN-normalised): normal/bn/gn",
        )
        parser.add_argument(
            "--res",
            type=str,
            default="fixed",
            help="Resampling strategy: fixed sinc-based lowpass filtering or learned conv layer: fixed/learned",
        )
        parser.add_argument(
            "--separate",
            type=int,
            default=1,
            help="Train separate model for each source (1) or only one (0)",
        )
        parser.add_argument(
            "--feature_growth",
            type=str,
            default="double",
            help="How the features in each layer should grow, either (add) the initial number of features each time, or multiply by 2 (double)",
        )
        return parser

    def setup(self):
        """Build the Wave-U-Net model and load its checkpoint.

        The settings are the parser defaults: an empty argument list is
        parsed, so nothing is read from the real command line. The parsed
        namespace is kept on ``self.args`` and the model on ``self.model``.
        """
        # Parse an empty argv so only the defaults above are used.
        args = self._build_arg_parser().parse_args([])
        self.args = args

        # Per-level channel counts: grow linearly ("add") or by powers of two.
        if args.feature_growth == "add":
            num_features = [args.features * i for i in range(1, args.levels + 1)]
        else:
            num_features = [args.features * 2 ** i for i in range(0, args.levels)]

        # Output duration times sampling rate, truncated to an int.
        target_outputs = int(args.output_size * args.sr)
        self.model = Waveunet(
            args.channels,
            num_features,
            args.channels,
            args.instruments,
            kernel_size=args.kernel_size,
            target_output_size=target_outputs,
            depth=args.depth,
            strides=args.strides,
            conv_type=args.conv_type,
            res=args.res,
            separate=args.separate,
        )

        if args.cuda:
            # Wrap the model that was just built; the original passed an
            # undefined name ``model`` here, which raised NameError.
            self.model = model_utils.DataParallel(self.model)
            print("move model to gpu")
            self.model.cuda()

        print("Loading model from checkpoint " + str(args.load_model))
        state = model_utils.load_model(self.model, None, args.load_model, args.cuda)
        print("Step", state["step"])

    @cog.input("input", type=Path, help="audio mixture path")
    def predict(self, input):
        """Separate tracks from input mixture audio.

        Args:
            input: Path to the audio mixture to separate.

        Returns:
            Path to a zip archive with one ``<input name>_<instrument>.wav``
            file per key of the prediction result.
        """
        out_path = Path(tempfile.mkdtemp())
        zip_path = Path(tempfile.mkdtemp()) / "output.zip"

        preds = predict_song(self.args, input, self.model)

        mixture_name = os.path.basename(str(input))
        out_names = []
        for inst, audio in preds.items():
            wav_path = os.path.join(str(out_path), mixture_name + "_" + inst + ".wav")
            data.utils.write_wav(wav_path, audio, self.args.sr)
            out_names.append(wav_path)

        with zipfile.ZipFile(str(zip_path), "w") as zf:
            for wav_path in out_names:
                # Store only the file name so the archive does not expose the
                # temporary directory layout.
                zf.write(wav_path, arcname=os.path.basename(wav_path))

        return zip_path