From ab27dcb9cdd9d47e0b2516a6ae700bb29d49c16e Mon Sep 17 00:00:00 2001 From: ericguizzo Date: Wed, 29 Sep 2021 17:43:58 +0200 Subject: [PATCH] cog-ified --- README.md | 113 +++++++++++++++++++++++++++++++++++++- cog.yaml | 20 +++++++ cog_predict.py | 144 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 cog.yaml create mode 100644 cog_predict.py diff --git a/README.md b/README.md index 6cb3c62..ef50c6f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,111 @@ -# Wave-U-Net-Pytorch -Improved Wave-U-Net implemented in Pytorch +# Wave-U-Net (Pytorch) + + +Improved version of the [Wave-U-Net](https://arxiv.org/abs/1806.03185) for audio source separation, implemented in Pytorch. + +Click [here](https://www.github.com/f90/Wave-U-Net) for the original Wave-U-Net implementation in Tensorflow. +You can find more information about the model and results there as well. + +# Improvements + +* Multi-instrument separation by default, using a separate standard Wave-U-Net for each source (can be set to one model as well) +* More scalable to larger data: A depth parameter D can be set that employs D convolutions for each single convolution in the original Wave-U-Net +* More configurable: Layer type, resampling factor at each level etc. can be easily changed (different normalization, residual connections...) 
+* Fast training: Preprocesses the given dataset by saving the audio into HDF files, which can be read very quickly during training, thereby avoiding slowdown due to resampling and decoding +* Modular thanks to Pytorch: Easily replace components of the model with your own variants/layers/losses +* Better output handling: Separate output convolution for each source estimate with linear activation so amplitudes near 1 and -1 can be easily predicted, at test time thresholding to valid amplitude range [-1,1] +* Fixed or dynamic resampling: Either use fixed lowpass filter to avoid aliasing during resampling, or use a learnable convolution + +# Installation + +GPU strongly recommended to avoid very long training times. + +### Option 1: Direct install (recommended) + +System requirements: +* Linux-based OS +* Python 3.6 +* [libsndfile](http://mega-nerd.com/libsndfile/) +* [ffmpeg](https://www.ffmpeg.org/) +* CUDA 10.1 for GPU usage + +Clone the repository: +``` +git clone https://github.com/f90/Wave-U-Net-Pytorch.git +``` + +Recommended: Create a new virtual environment to install the required Python packages into, then activate the virtual environment: + +``` +virtualenv --python /usr/bin/python3.6 waveunet-env +source waveunet-env/bin/activate +``` + +Install all the required packages listed in the ``requirements.txt``: + +``` +pip3 install -r requirements.txt +``` + +### Option 2: Singularity + +We also provide a Singularity container which allows you to avoid installing the correct Python, CUDA and other system libraries, however we don't provide specific advice on how to run the container and so only do this if you have to or know what you are doing (since you need to mount dataset paths to the container etc.) + +To pull the container, run +``` +singularity pull shub://f90/Wave-U-Net-Pytorch +``` + +Then run the container from the directory where you cloned this repository to, using the commands listed further below in this readme. 
+ +# Download datasets + +To directly use the pre-trained models we provide for download to separate your own songs, now skip directly to the [last section](#test-trained-models-on-songs), since the datasets are not needed in that case. + +To start training your own models, download the [full MUSDB18HQ dataset](https://sigsep.github.io/datasets/musdb.html) and extract it into a folder of your choice. It should have two subfolders: "test" and "train" as well as a README.md file. + +You can of course use your own datasets for training, but for this you would need to modify the code manually, which will not be discussed here. However, we provide a loading function for the normal MUSDB18 dataset as well. + +# Training the models + +To train a Wave-U-Net, the basic command to use is +``` +python3.6 train.py --dataset_dir /PATH/TO/MUSDB18HQ +``` +where the path to MUSDB18HQ dataset needs to be specified, which contains the ``train`` and ``test`` subfolders. + +Add more command line parameters as needed: +* ``--cuda`` to activate GPU usage +* ``--hdf_dir PATH`` to save the preprocessed data (HDF files) to custom location PATH, instead of the default ``hdf`` subfolder in this repository +* ``--checkpoint_dir`` and ``--log_dir`` to specify where checkpoint files and logs are saved/loaded +* ``--load_model checkpoints/model_name/checkpoint_X`` to start training with weights given by a certain checkpoint + +For more config options, see ``train.py``. + +Training progress can be monitored by using Tensorboard on the respective ``log_dir``. +After training, the model is evaluated on the MUSDB18HQ test set, and SDR/SIR/SAR metrics are reported for all instruments and written to Tensorboard and, in more detail, to a ``results.pkl`` file in the ``checkpoint_dir``. + +# Test trained models on songs! + +We provide the default model in a pre-trained form as download so you can separate your own songs right away. 
+ +## Downloading our pretrained models + +Download our pretrained model [here](https://www.dropbox.com/s/r374hce896g4xlj/models.7z?dl=1). +Extract the archive into the ``checkpoints`` subfolder in this repository, so that you have one subfolder for each model (e.g. ``REPO/checkpoints/waveunet``) + +## Run pretrained model + +To apply our pretrained model to any of your own songs, simply point to its audio file path using the ``--input`` parameter: + +``` +python3.6 predict.py --load_model checkpoints/waveunet/model --input "audio_examples/Cristina Vane - So Easy/mix.mp3" +``` + +* Add ``--cuda`` when using a GPU; it should be much quicker +* Point ``--input`` to the music file you want to separate + +By default, output is written where the input music file is located, using the original file name plus the instrument name as output file name. Use ``--output`` to customise the output directory. + +To run your own model: +* Point ``--load_model`` to the checkpoint file of the model you are using. If you used non-default hyper-parameters to train your own model, you must specify them here again so the correct model is set up and can receive the weights! 
import os
import cog
import tempfile
import zipfile
from pathlib import Path
import argparse
import data.utils
import model.utils as model_utils
from test import predict_song
from model.waveunet import Waveunet


class waveunetPredictor(cog.Predictor):
    """Cog predictor that separates an audio mixture with a Wave-U-Net.

    ``setup`` builds the network from the default hyper-parameters and loads
    the checkpoint; ``predict`` runs ``predict_song`` on one input file and
    returns a zip archive holding one WAV file per separated source.
    """

    @staticmethod
    def _default_args():
        """Return the default hyper-parameters as an ``argparse.Namespace``.

        The parser is evaluated against an empty argument list, so every
        option takes its declared default. A namespace (rather than a plain
        dict) is kept because the same object is later handed to
        ``predict_song``.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--instruments",
            type=str,
            nargs="+",
            default=["bass", "drums", "other", "vocals"],
            help='List of instruments to separate (default: "bass drums other vocals")',
        )
        parser.add_argument(
            "--cuda", action="store_true", help="Use CUDA (default: False)"
        )
        parser.add_argument(
            "--features",
            type=int,
            default=32,
            help="Number of feature channels per layer",
        )
        parser.add_argument(
            "--load_model",
            type=str,
            default="checkpoints/waveunet/model",
            help="Reload a previously trained model",
        )
        parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
        parser.add_argument(
            "--levels", type=int, default=6, help="Number of DS/US blocks"
        )
        parser.add_argument(
            "--depth", type=int, default=1, help="Number of convs per block"
        )
        parser.add_argument("--sr", type=int, default=44100, help="Sampling rate")
        parser.add_argument(
            "--channels", type=int, default=2, help="Number of input audio channels"
        )
        parser.add_argument(
            "--kernel_size",
            type=int,
            default=5,
            help="Filter width of kernels. Has to be an odd number",
        )
        parser.add_argument(
            "--output_size", type=float, default=2.0, help="Output duration"
        )
        parser.add_argument(
            "--strides", type=int, default=4, help="Strides in Waveunet"
        )
        parser.add_argument(
            "--conv_type",
            type=str,
            default="gn",
            help="Type of convolution (normal, BN-normalised, GN-normalised): normal/bn/gn",
        )
        parser.add_argument(
            "--res",
            type=str,
            default="fixed",
            help="Resampling strategy: fixed sinc-based lowpass filtering or learned conv layer: fixed/learned",
        )
        parser.add_argument(
            "--separate",
            type=int,
            default=1,
            help="Train separate model for each source (1) or only one (0)",
        )
        parser.add_argument(
            "--feature_growth",
            type=str,
            default="double",
            help="How the features in each layer should grow, either (add) the initial number of features each time, or multiply by 2 (double)",
        )
        return parser.parse_args([])

    def setup(self):
        """Build the Wave-U-Net and load its weights from the checkpoint.

        Stores the hyper-parameter namespace on ``self.args`` and the model
        on ``self.model``. The checkpoint location is ``args.load_model``.
        """
        args = self._default_args()
        self.args = args

        # Per-level channel counts: linear growth for "add", otherwise doubling.
        if args.feature_growth == "add":
            num_features = [args.features * i for i in range(1, args.levels + 1)]
        else:
            num_features = [args.features * 2 ** i for i in range(0, args.levels)]

        # Requested output length in samples (duration in seconds * sample rate).
        target_outputs = int(args.output_size * args.sr)
        self.model = Waveunet(
            args.channels,
            num_features,
            args.channels,
            args.instruments,
            kernel_size=args.kernel_size,
            target_output_size=target_outputs,
            depth=args.depth,
            strides=args.strides,
            conv_type=args.conv_type,
            res=args.res,
            separate=args.separate,
        )

        # With the empty argument list above, ``cuda`` keeps its default
        # (False); this branch only runs if that default is changed.
        if args.cuda:
            # Wrap the model that was just built (the original code referenced
            # an undefined local ``model`` here and raised NameError).
            self.model = model_utils.DataParallel(self.model)
            print("move model to gpu")
            self.model.cuda()

        print("Loading model from checkpoint " + str(args.load_model))
        state = model_utils.load_model(self.model, None, args.load_model, args.cuda)
        print("Step", state["step"])

    @cog.input("input", type=Path, help="audio mixture path")
    def predict(self, input):
        """Separate the sources of one audio mixture.

        Args:
            input: Path of the audio mixture. The name shadows the builtin
                but is kept because it is the declared cog input name.

        Returns:
            Path of a zip archive containing one
            ``<input file name>_<instrument>.wav`` entry per key of the
            mapping returned by ``predict_song``.
        """
        # Run inference first so a failure leaves no temp directories behind.
        preds = predict_song(self.args, input, self.model)

        # The archive has to outlive this call, so its directory is not
        # removed here.
        zip_path = Path(tempfile.mkdtemp()) / "output.zip"
        mix_name = os.path.basename(str(input))

        # The intermediate WAV files are only needed until they are zipped;
        # the temporary directory is deleted when the ``with`` block exits.
        with tempfile.TemporaryDirectory() as wav_dir, zipfile.ZipFile(
            str(zip_path), "w"
        ) as zf:
            for inst, audio in preds.items():
                wav_name = mix_name + "_" + inst + ".wav"
                wav_path = os.path.join(wav_dir, wav_name)
                data.utils.write_wav(wav_path, audio, self.args.sr)
                # Store under the bare file name so the archive does not embed
                # the absolute temp-directory path.
                zf.write(wav_path, arcname=wav_name)

        return zip_path