Upload 80 files

This view is limited to 50 files because it contains too many changes.
- .gitattributes +24 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/JEN1-Composer.jpg +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/Jen1-Composer-2.png +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/README.md +99 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/__pycache__/trainer.cpython-311.pyc +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio1.mp3 +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio2.mp3 +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio3.mp3 +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio4.mp3 +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio5.mp3 +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio6.mp3 +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/bass.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/drums.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/other.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/vocals.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/bass.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/drums.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/other.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/vocals.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/bass.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/drums.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/other.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/vocals.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/bass.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/drums.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/other.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/vocals.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/bass.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/drums.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/other.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/vocals.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/bass.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/drums.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/other.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/vocals.wav +3 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio1.json +1 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio2.json +1 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio3.json +1 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio4.json +1 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio5.json +1 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio6.json +1 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/__pycache__/audio_processor.cpython-38.pyc +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/__pycache__/dataloader.cpython-311.pyc +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/__pycache__/dataloader.cpython-38.pyc +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/dataloader.py +191 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/demix.py +51 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/generation.py +212 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/jen1/__pycache__/conditioners.cpython-311.pyc +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/jen1/__pycache__/conditioners.cpython-38.pyc +0 -0
- JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/jen1/__pycache__/noise_schedule.cpython-38.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/bass.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/drums.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/other.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/vocals.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/bass.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/drums.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/other.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/vocals.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/bass.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/drums.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/other.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/vocals.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/bass.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/drums.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/other.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/vocals.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/bass.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/drums.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/other.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/vocals.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/bass.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/drums.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/other.wav filter=lfs diff=lfs merge=lfs -text
+JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/vocals.wav filter=lfs diff=lfs merge=lfs -text
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/JEN1-Composer.jpg
ADDED
Binary image file.
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/Jen1-Composer-2.png
ADDED
Binary image file.
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/README.md
ADDED
@@ -0,0 +1,99 @@
+# JEN-1-COMPOSER-pytorch (WIP)
+Unofficial implementation of [JEN-1 Composer: A Unified Framework for High-Fidelity Multi-Track Music Generation](https://arxiv.org/abs/2310.19180).
+
+
+
+
+## README
+
+## 📖 Quick Index
+* [💻 Installation](#-installation)
+* [🐍 Usage](#-usage)
+* [🧠 TODO](#-todo)
+* [🚀 Demo](#-demo)
+* [🙏 Appreciation](#-appreciation)
+* [⭐️ Show Your Support](#-show-your-support)
+* [🙆 Welcome Contributions](#-welcome-contributions)
+
+## 💻 Installation
+```commandline
+git clone https://github.com/0417keito/JEN-1-pytorch.git
+cd JEN-1-pytorch
+pip install -r requirements.txt
+```
+
+## 🐍 Usage
+### Sampling
+```python
+import torch
+from generation import Jen1
+
+ckpt_path = 'your ckpt path'
+jen1 = Jen1(ckpt_path)
+
+prompt = 'a beautiful song'
+samples = jen1.generate(prompt)
+```
+
+### Training
+```commandline
+torchrun train.py
+```
+
+### Dataset format
+JSON format. The name of each JSON file must be the same as that of the target music file.
+```json
+{"prompt": "a beautiful song"}
+```
+How should the data_dir be created?
+```
+dataset_dir
+├── audios
+|    ├── music1.wav
+|    ├── music2.wav
+|    .......
+|    ├── music{n}.wav
+|
+├── metadata
+|    ├── music1.json
+|    ├── music2.json
+|    ......
+|    ├── music{n}.json
+|
+```
+
+### About config
+Please see [config.py](https://github.com/0417keito/JEN-1-pytorch/blob/main/utils/config.py) and [conditioner_config.py](https://github.com/0417keito/JEN-1-pytorch/blob/main/utils/conditioner_config.py).
+
+## 🧠 TODO
+- [ ] Extension to [JEN-1-Composer](https://arxiv.org/abs/2310.19180)
+- [ ] Extension to music generation with singing voice
+- [ ] Adaptation of the Consistency Model
+- [ ] The paper uses a Diffusion Autoencoder, but I did not have enough computing resources, so I used Encodec instead. If resources allow, I will implement the Diffusion Autoencoder.
+
+## 🚀 Demo
+Coming soon!
+
+## 🙏 Appreciation
+[Dr Adam Fils](https://github.com/adamfils) - for support and for bringing this to my attention.
+
+## ⭐️ Show Your Support
+If you find this repo interesting and useful, give us a ⭐️ on GitHub! It encourages us to keep improving the model and adding exciting features.
+Please report any deficiencies by opening an issue.
+
+## 🙆 Welcome Contributions
+Contributions are always welcome.
+
+## Citations
+```bibtex
+@misc{2310.19180,
+  Author = {Yao Yao and Peike Li and Boyu Chen and Alex Wang},
+  Title = {JEN-1 Composer: A Unified Framework for High-Fidelity Multi-Track Music Generation},
+  Year = {2023},
+  Eprint = {arXiv:2310.19180},
+}
+```
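The "Dataset format" section above pairs each audio file with a same-named JSON file under `metadata/`, which is exactly the layout of the `data/` directory added in this commit. As a minimal sketch of how those metadata files could be produced (the prompt text and the `data` path are illustrative placeholders, not part of the repository):

```python
import json
from pathlib import Path

dataset_dir = Path('data')  # assumed layout: data/audios, data/metadata
metadata_dir = dataset_dir / 'metadata'
metadata_dir.mkdir(parents=True, exist_ok=True)

for audio_path in (dataset_dir / 'audios').glob('*.mp3'):
    # One JSON per track, named after the audio file (audio1.mp3 -> audio1.json).
    meta_path = metadata_dir / f'{audio_path.stem}.json'
    meta_path.write_text(json.dumps({'prompt': 'a beautiful song'}))
```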
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/__pycache__/trainer.cpython-311.pyc
ADDED
Binary file (25.2 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio1.mp3
ADDED
Binary file (481 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio2.mp3
ADDED
Binary file (481 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio3.mp3
ADDED
Binary file (481 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio4.mp3
ADDED
Binary file (481 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio5.mp3
ADDED
Binary file (481 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/audio6.mp3
ADDED
Binary file (481 kB).
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/bass.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6bc7ca834b5899a1f35e0294213ab0b74a305c718639af75b5d5f079cd7ca7
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/drums.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e0a30f4be76598d557f5eb3e4c616f104d8b6624484de64ef040acf790d2380
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/other.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39447a94082b91122e123e93edaa7fae768e796d6092eb1471f67e38a30d5253
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio1/vocals.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3c51fb6ffd95e13cc866c499037f84a972985decb9cdd93c184fa27cd464458
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/bass.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8fb9f3d14c2f7a9c6792fe686fafc9a9be9e8169ab5efd481bd274156af2f18
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/drums.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aad842cce67224f86c8bd3aa8e6cc673467b3f02dc39883be66a662b3c34363d
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/other.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4d1bee50855e4c06252e665300a9ff02b1aa1c66822bb60da0119ec950b0299
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio2/vocals.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca8c87f0b2569e52d499601463b4f0f394bad3e0c8002acca3e0aad25bc356f8
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/bass.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bf1d2592748b07e4a8553aae79eacb25b1d19c50f2ec7b658c9a224138e18d8
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/drums.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df8950751972beb6fc0f628a6c53e230f0eb785e68d9b3f8c8ea62895aaa40c5
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/other.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c003b116a0f3e56addfb7de9b3fe5e60c0a70e8d0d1f4af28b7e4bebd0639c2e
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio3/vocals.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9c62024fc680006609ee8c01c0bf8092287b7f8d435a979cf200eeb3de3b10f
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/bass.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93dee392485d0c3d3f9ca3cd312dae648a5840135cf272df631a0c815fbd0449
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/drums.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed3e0f48e874df199d8f95392504dde16d727da2396bb3427704bd7ec105c5a1
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/other.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1660aa65126295133d2fe1cc1ba12643c2679e13ecfa864e5e2b308c10a3812d
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio4/vocals.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e986bb445f169d4391a56fcebc5f55e8c39bcff752f61ccbf477161b303a1f69
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/bass.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b99f55edf4947722b0e0ed6b76099827f21eae020edac341d20f798758b615c5
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/drums.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eefadf55c12881632f35cf171e522117cc7e19c18683ece2887d2f9d5f79cb3a
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/other.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1da07f1d5c02b66d7d9091cce8bd8e5ddbc3aea1bd1f3bcb1eea8ff36abbd60
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio5/vocals.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b1411e7c0e128bc457873e011de7896f191f0bdee89c789e73a3d8c67df75f6
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/bass.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dcafd9ecc1a257897feacd719ce27359c48fd0115d6593d1b747ec2a32613c1
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/drums.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cf62a180ea5d3939eec617a6dfa71ec8cc4a8cfe9fe90d282faf2aa2b698113
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/other.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bca9d5f51d1118a5c48228fd13825479e386bd3eedb929cdc3b5f71bb5db5f25
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/audios/htdemucs/audio6/vocals.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cc00cd8b79e898899f6d24a4216c7ccae407760bd2750746f652e07df8c2233
+size 5292044

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio1.json
ADDED
@@ -0,0 +1 @@
+{"prompt": "a beautiful song"}

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio2.json
ADDED
@@ -0,0 +1 @@
+{"prompt": "a beautiful song"}

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio3.json
ADDED
@@ -0,0 +1 @@
+{"prompt": "a beautiful song"}

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio4.json
ADDED
@@ -0,0 +1 @@
+{"prompt": "a beautiful song"}

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio5.json
ADDED
@@ -0,0 +1 @@
+{"prompt": "a beautiful song"}

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/data/metadata/audio6.json
ADDED
@@ -0,0 +1 @@
+{"prompt": "a beautiful song"}
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/__pycache__/audio_processor.cpython-38.pyc
ADDED
Binary file (3.84 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/__pycache__/dataloader.cpython-311.pyc
ADDED
Binary file (14.8 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/__pycache__/dataloader.cpython-38.pyc
ADDED
Binary file (6.55 kB).
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/dataloader.py
ADDED
@@ -0,0 +1,191 @@
+import os
+import random
+import json
+import torch
+import torchaudio
+from torch.utils.data import Dataset, DataLoader, random_split
+
+from encodec import EncodecModel
+from encodec.utils import convert_audio
+
+
+class MusicDataset(Dataset):
+    def __init__(self, dataset_dir, sr, channels, min_duration, max_duration,
+                 sample_duration, aug_shift, device, composer=False):
+        super().__init__()
+        self.dataset_dir = dataset_dir
+        self.sr = sr
+        self.channels = channels
+        self.min_duration = min_duration
+        self.max_duration = max_duration
+        self.sample_duration = sample_duration
+        self.aug_shift = aug_shift
+        self.device = device
+        self.model = EncodecModel.encodec_model_48khz().to(device=self.device)
+        self.audio_files_dir = f'{dataset_dir}/audios'
+        self.metadatas_dir = f'{dataset_dir}/metadata'
+        self.composer = composer
+        if composer:
+            self.demix_dir = f'{self.audio_files_dir}/htdemucs'
+        self.init_dataset()
+
+    def get_duration_sec(self, file):
+        wav, sr = torchaudio.load(file)
+        duration_sec = wav.shape[-1] / sr
+        return duration_sec
+
+    def filter(self, audio_files, durations):
+        # Keep only tracks whose duration (already in seconds) lies in
+        # [min_duration, max_duration).
+        keep = []
+        self.audio_files = []
+        for i in range(len(audio_files)):
+            filepath = audio_files[i]
+            if durations[i] < self.min_duration:
+                continue
+            if durations[i] >= self.max_duration:
+                continue
+            keep.append(i)
+            self.audio_files.append(filepath)
+        self.durations = [durations[i] for i in keep]  # in (s)
+        duration_tensor = torch.tensor(self.durations)
+        self.cumsum = torch.cumsum(duration_tensor, dim=0)  # in (s)
+
+    def init_dataset(self):
+        audio_files = os.listdir(self.audio_files_dir)
+        audio_files = [f'{self.audio_files_dir}/{file}' for file in audio_files
+                       if file.endswith('.wav') or file.endswith('.mp3')]
+        durations = [self.get_duration_sec(file) for file in audio_files]
+        self.filter(audio_files=audio_files, durations=durations)
+
+    def get_index_offset(self, item):
+        # Map a flat item index to (track index, offset in seconds), optionally
+        # jittering the window by up to half a sample_duration for augmentation.
+        half_interval = self.sample_duration // 2
+        shift = random.randint(-half_interval, half_interval) if self.aug_shift else 0
+        offset = item * self.sample_duration + shift
+        midpoint = offset + half_interval
+        assert 0 <= midpoint < self.cumsum[-1], f'Midpoint {midpoint} of item beyond total length {self.cumsum[-1]}'
+        index = torch.searchsorted(self.cumsum, midpoint)
+        start, end = self.cumsum[index - 1] if index > 0 else 0.0, self.cumsum[index]
+        assert start <= midpoint <= end, f'Midpoint {midpoint} not inside interval [{start}, {end}] for index {index}'
+        if offset > end - self.sample_duration:
+            offset = max(start, offset - half_interval)
+        elif offset < start:
+            offset = min(end - self.sample_duration, offset + half_interval)
+        assert start <= offset <= end - self.sample_duration, f'Offset {offset} not in [{start}, {end}] for index {index}'
+        offset = offset - start
+        return index, offset
+
+    def get_song_chunk(self, index, offset):
+        audio_file_path = self.audio_files[index]
+        song_name = os.path.splitext(os.path.basename(audio_file_path))[0]
+        wav, sr = torchaudio.load(audio_file_path)
+
+        start_sample = int(offset * sr)
+        end_sample = start_sample + int(self.sample_duration * sr)
+        chunk = wav[:, start_sample:end_sample]
+        if self.composer:
+            # Load the matching demucs stems for the same time window.
+            demix_chunks = {}
+            demix_file_dict = {'bass': f'{self.demix_dir}/{song_name}/bass.wav',
+                               'drums': f'{self.demix_dir}/{song_name}/drums.wav',
+                               'other': f'{self.demix_dir}/{song_name}/other.wav'}
+            for key, value in demix_file_dict.items():
+                demix_chunk, demix_sr = torchaudio.load(value)
+                start_sample = int(offset * demix_sr)
+                end_sample = start_sample + int(self.sample_duration * demix_sr)
+                demix_chunks[key] = {'demix_chunk': demix_chunk[:, start_sample:end_sample],
+                                     'demix_sr': demix_sr}
+            return chunk, sr, demix_chunks
+
+        return chunk, sr
+
+    def __len__(self):
+        return len(self.durations)
+
+    def __getitem__(self, item):
+        index, offset = self.get_index_offset(item)
+        if self.composer:
+            chunk, sr, demix_chunks = self.get_song_chunk(index, offset)
+        else:
+            chunk, sr = self.get_song_chunk(index, offset)
+        song_name = os.path.splitext(os.path.basename(self.audio_files[index]))[0]
+        metadata = None
+        if os.path.exists(f'{self.metadatas_dir}/{song_name}.json'):
+            with open(f'{self.metadatas_dir}/{song_name}.json', 'r') as file:
+                metadata = json.load(file)
+        # Encode the mix with Encodec, then decode the codes back into the
+        # continuous latent that the diffusion model operates on.
+        chunk = convert_audio(chunk, sr, self.model.sample_rate, self.model.channels)
+        chunk = chunk.unsqueeze(0).to(device=self.device)
+        with torch.no_grad():
+            encoded_frames = self.model.encode(chunk)
+            chunk = chunk.mean(0, keepdim=True)
+            codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
+            codes = codes.transpose(0, 1)
+            emb = self.model.quantizer.decode(codes)
+            emb = emb.to(self.device)
+
+        demix_embs = None
+        if self.composer:
+            demix_embs = {}
+            for key, value in demix_chunks.items():
+                demix_chunk = value['demix_chunk']
+                demix_sr = value['demix_sr']
+                demix_chunk = convert_audio(demix_chunk, demix_sr, self.model.sample_rate, self.model.channels)
+                demix_chunk = demix_chunk.unsqueeze(0).to(device=self.device)
+                with torch.no_grad():
+                    demix_encoded_frames = self.model.encode(demix_chunk)
+                    demix_chunk = demix_chunk.mean(0, keepdim=True)
+                    demix_codes = torch.cat([encoded[0] for encoded in demix_encoded_frames], dim=-1)
+                    demix_codes = demix_codes.transpose(0, 1)
+                    demix_emb = self.model.quantizer.decode(demix_codes)
+                    demix_emb = demix_emb.to(self.device)
+                demix_embs[key] = demix_emb
+
+        return chunk, metadata, emb, demix_embs
+
+
+def collate(batch):
+    audio, data, emb, demix_embs = zip(*batch)
+    audio = torch.cat(audio, dim=0)
+    emb = torch.cat(emb, dim=0)
+    metadata = list(data)
+    demix_embs_dict = None
+    if demix_embs[0] is not None:
+        # Stack the per-sample stem embeddings into one tensor per stem.
+        keys = demix_embs[0].keys()
+        tensors_dict = {key: [] for key in keys}
+        for d in demix_embs:
+            for key in keys:
+                tensors_dict[key].append(d[key])
+        demix_embs_dict = {key: torch.cat(tensors_dict[key], dim=0) for key in keys}
+    return (emb, metadata, demix_embs_dict)
+
+
+def get_dataloader(dataset_folder, batch_size: int = 50, shuffle: bool = True, **dataset_kwargs):
+    # Forward the remaining MusicDataset arguments (sr, channels, durations, ...).
+    dataset = MusicDataset(dataset_dir=dataset_folder, **dataset_kwargs)
+    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)
+    return dataloader
+
+
+def get_dataloaders(dataset_dir, sr, channels, min_duration, max_duration, sample_duration,
+                    aug_shift, batch_size: int = 50, shuffle: bool = True, split_ratio=0.8, device='cpu',
+                    composer=False):
+    if not isinstance(dataset_dir, tuple):
+        dataset = MusicDataset(dataset_dir=dataset_dir, sr=sr, channels=channels,
+                               min_duration=min_duration, max_duration=max_duration, sample_duration=sample_duration,
+                               aug_shift=aug_shift, device=device, composer=composer)
+        # Split the dataset into train and validation
+        train_size = int(split_ratio * len(dataset))
+        val_size = len(dataset) - train_size
+        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
+    else:
+        train_dir, valid_dir = dataset_dir
+        train_dataset = MusicDataset(dataset_dir=train_dir, sr=sr, channels=channels,
+                                     min_duration=min_duration, max_duration=max_duration, sample_duration=sample_duration,
+                                     aug_shift=aug_shift, device=device, composer=composer)
+        val_dataset = MusicDataset(dataset_dir=valid_dir, sr=sr, channels=channels,
+                                   min_duration=min_duration, max_duration=max_duration, sample_duration=sample_duration,
+                                   aug_shift=aug_shift, device=device, composer=composer)
+    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate, drop_last=True)
+    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate, drop_last=True)
+
+    return train_dataloader, val_dataloader
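A minimal sketch of how `get_dataloaders` might be called for the composer setup; the hyperparameter values below are illustrative assumptions, not values taken from the repository's config:

```python
import torch
from dataset.dataloader import get_dataloaders

device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_loader, val_loader = get_dataloaders(
    dataset_dir='data',                 # expects data/audios, data/metadata, data/audios/htdemucs
    sr=48000, channels=2,               # Encodec 48 kHz stereo, as used by MusicDataset
    min_duration=10, max_duration=300,  # keep tracks between 10 s and 300 s
    sample_duration=5,                  # seconds per training chunk
    aug_shift=True, batch_size=4, device=device, composer=True)

# Each batch is (emb, metadata, demix_embs_dict), per collate() above.
emb, metadata, demix_embs = next(iter(train_loader))
```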
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/dataset/demix.py
ADDED
@@ -0,0 +1,51 @@
+import sys
+import subprocess
+import torch
+
+from pathlib import Path
+from typing import List, Union
+
+
+def demix(paths: List[Path], demix_dir: Path, device: Union[str, torch.device]):
+    """Demixes the audio files into their sources."""
+    todos = []
+    demix_paths = []
+    for path in paths:
+        out_dir = demix_dir / 'htdemucs' / path.stem
+        demix_paths.append(out_dir)
+        if out_dir.is_dir():
+            # Skip tracks whose four stems already exist.
+            if (
+                (out_dir / 'bass.wav').is_file() and
+                (out_dir / 'drums.wav').is_file() and
+                (out_dir / 'other.wav').is_file() and
+                (out_dir / 'vocals.wav').is_file()
+            ):
+                continue
+        todos.append(path)
+
+    existing = len(paths) - len(todos)
+    print(f'=> Found {existing} tracks already demixed, {len(todos)} to demix.')
+
+    if todos:
+        subprocess.run(
+            [
+                sys.executable, '-m', 'demucs.separate',
+                '--out', demix_dir.as_posix(),
+                '--name', 'htdemucs',
+                '--device', str(device),
+                *[path.as_posix() for path in todos],
+            ],
+            check=True,
+        )
+
+    return demix_paths
+
+
+def find_audio_files(directory: Path):
+    return list(directory.rglob('*.mp3'))
+
+
+if __name__ == '__main__':
+    dataset_dir = '/home/keito/data/audios'
+    audio_dir = Path(dataset_dir)
+    audio_paths = find_audio_files(audio_dir)
+    demixed_paths = demix(audio_paths, audio_dir, 'cuda')
+    print('demixed_paths', demixed_paths)
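`demix` shells out to Demucs, so for each batch of files it effectively runs the command below (the paths are placeholders), writing the four stems to `<out>/htdemucs/<track>/{bass,drums,other,vocals}.wav`, the layout that `MusicDataset` expects:

```commandline
python -m demucs.separate --out data/audios --name htdemucs --device cuda data/audios/audio1.mp3
```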
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/generation.py
ADDED
@@ -0,0 +1,212 @@
+import torch
+import torchaudio
+import numpy as np
+import math
+
+from utils.script_util import create_multi_conditioner, load_checkpoint
+from utils.config import Config
+
+from encodec import EncodecModel
+from encodec.utils import convert_audio
+
+from jen1.diffusion.gdm.gdm import GaussianDiffusion
+from jen1.model.model import UNetCFG1d
+from jen1.diffusion.gdm.noise_schedule import get_beta_schedule
+
+
+class Jen1():
+    def __init__(self,
+                 ckpt_path,
+                 device='cuda' if torch.cuda.is_available() else 'cpu',
+                 sample_rate=48000,
+                 cross_attn_cond_ids=['prompt'],
+                 global_cond_ids=[],
+                 input_concat_ids=['masked_input', 'mask']):
+        self.ckpt_path = ckpt_path
+        self.device = device
+        self.sample_rate = sample_rate
+        self.config = Config
+        self.conditioner = create_multi_conditioner(self.config.conditioner_config)
+        self.cross_attn_cond_ids = cross_attn_cond_ids
+        self.global_cond_ids = global_cond_ids
+        self.input_concat_ids = input_concat_ids
+
+        self.audio_encoder = EncodecModel.encodec_model_48khz()
+
+    def get_model_and_diffusion(self, steps, use_gdm):
+        # Only the Gaussian-diffusion (GDM) path is constructed here.
+        if use_gdm:
+            diffusion_config = self.config.diffusion_config.gaussian_diffusion
+        else:
+            diffusion_config = self.config.diffusion_config.variational_diffusion
+        model_config = self.config.model_config
+
+        if use_gdm:
+            betas, alphas = get_beta_schedule(diffusion_config.noise_schedule, diffusion_config.steps)
+            betas = betas.to(self.device)
+            betas = betas.to(torch.float32)
+            if alphas is not None:
+                alphas = alphas.to(self.device)
+                alphas = alphas.to(torch.float32)
+            diffusion = GaussianDiffusion(steps=diffusion_config.steps, betas=betas, alphas=alphas,
+                                          objective=diffusion_config.objective, loss_type=diffusion_config.loss_type,
+                                          device=self.device, cfg_dropout_proba=diffusion_config.cfg_dropout_proba,
+                                          embedding_scale=diffusion_config.embedding_scale,
+                                          batch_cfg=diffusion_config.batch_cfg, scale_cfg=diffusion_config.scale_cfg,
+                                          sampling_timesteps=steps, use_fp16=False)
+
+        config_dict = {k: v for k, v in model_config.__dict__.items() if not k.startswith('__') and not callable(v)}
+        context_embedding_features = config_dict.pop('context_embedding_features', None)
+        context_embedding_max_length = config_dict.pop('context_embedding_max_length', None)
+
+        model = UNetCFG1d(context_embedding_features=context_embedding_features,
+                          context_embedding_max_length=context_embedding_max_length,
+                          **config_dict).to(self.device)
+
+        #model, _, _, _ = load_checkpoint(self.ckpt_path, model)
+        model.eval()
+        diffusion.eval()
+
+        return diffusion, model
+
+    def generate(self, prompt, seed=-1, steps=100, batch_size=1, seconds=30, use_gdm=True,
+                 task='text_guided', init_audio=None, init_audio_sr=None, inpainting_scope=None):
+
+        seed = seed if seed != -1 else np.random.randint(0, 2**32 - 1)
+        torch.manual_seed(seed)
+        self.batch_size = batch_size
+
+        diffusion, model = self.get_model_and_diffusion(steps, use_gdm)
+
+        if init_audio is not None and init_audio.dim() != 3:
+            init_audio = init_audio.repeat(batch_size, 1, 1)
+
+        # flag marks generation from scratch (no conditioning audio was given).
+        flag = False
+        sample_length = seconds * self.sample_rate
+        if init_audio is None:
+            flag = True
+            shape = (batch_size, self.audio_encoder.channels, sample_length)
+            init_audio = torch.zeros(shape)
+            init_audio_sr = self.sample_rate
+
+        init_audio = convert_audio(init_audio, init_audio_sr, self.sample_rate, self.audio_encoder.channels)
+
+        if task == 'text_guided':
+            mask = self.get_mask(sample_length, 0, seconds, batch_size)
+            causal = False
+        elif task == 'music_inpaint':
+            mask = self.get_mask(sample_length, inpainting_scope[0], inpainting_scope[1], batch_size)
+            causal = False
+        elif task == 'music_cont':
+            cont_length = sample_length - init_audio.size(2)
+            cont_start = init_audio.size(2)
+            mask = self.get_mask(sample_length, cont_start / self.sample_rate, seconds, batch_size)
+            cont_audio = torch.randn(batch_size, self.audio_encoder.channels, cont_length, device=self.device)
+            cont_audio = cont_audio * mask[:, :, cont_start:]
+            init_audio = torch.cat([init_audio, cont_audio], dim=2)
+            causal = True
+        else:
+            raise ValueError(f'Unknown task: {task}')
+
+        with torch.no_grad():
+            init_emb = self.get_emb(init_audio).to(self.device)
+            emb_shape = init_emb.shape
+            mask = mask.to(self.device)
+
+            mask = torch.nn.functional.interpolate(mask, size=(emb_shape[2]))
+            masked_emb = init_emb * mask
+            if flag:
+                init_emb = None
+            batch_metadata = [{'prompt': prompt} for _ in range(batch_size)]
+            conditioning = self.conditioner(batch_metadata, self.device)
+            conditioning['masked_input'] = masked_emb
+            conditioning['mask'] = mask
+            conditioning = self.get_conditioning(conditioning)
+
+            sample_embs = diffusion.sample(model, emb_shape, conditioning, causal, init_data=init_emb)
+            sample_embs = sample_embs.to('cpu')
+            samples = self.audio_encoder.decoder(sample_embs)
+
+        return samples
+
+    def get_mask(self, sample_size, start, end, batch_size):
+        # The mask is 1 where audio is kept and 0 over [start, end) seconds,
+        # i.e. over the region the diffusion model must generate.
+        masks = []
+        maskstart = math.floor(start * self.sample_rate)
+        maskend = math.ceil(end * self.sample_rate)
+        mask = torch.ones((1, 1, sample_size))
+        mask[:, :, maskstart:maskend] = 0
+        masks.append(mask)
+        mask = torch.concat(masks * batch_size, dim=0)
+
+        return mask
+
+    def get_emb(self, audio):
+        encoded_frames = self.audio_encoder.encode(audio)
+        codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
+        codes = codes.transpose(0, 1)
+        emb = self.audio_encoder.quantizer.decode(codes)
+        return emb
+
+    def get_conditioning(self, cond):
+        cross_attention_input = None
+        cross_attention_masks = None
+        global_cond = None
+        input_concat_cond = None
+
+        if len(self.cross_attn_cond_ids) > 0:
+            # Concatenate all cross-attention inputs over the sequence dimension
+            # Assumes that the cross-attention inputs are of shape (batch, seq, channels)
+            cross_attention_input = torch.cat([cond[key][0] for key in self.cross_attn_cond_ids], dim=1)
+            cross_attention_masks = torch.cat([cond[key][1] for key in self.cross_attn_cond_ids], dim=1)
+
+        if len(self.global_cond_ids) > 0:
+            # Concatenate all global conditioning inputs over the channel dimension
+            # Assumes that the global conditioning inputs are of shape (batch, channels)
+            global_cond = torch.cat([cond[key][0] for key in self.global_cond_ids], dim=-1)
+            if len(global_cond.shape) == 3:
+                global_cond = global_cond.squeeze(1)
+
+        if len(self.input_concat_ids) > 0:
+            concated_tensors = []
+            for key in self.input_concat_ids:
+                tensor = cond[key][0]
+                if tensor.ndim == 2:
+                    # For some reason the batch dimension is missing; restore it.
+                    tensor = tensor.unsqueeze(0)
+                    tensor = tensor.expand(self.batch_size, -1, -1)
+                concated_tensors.append(tensor)
+            # Concatenate all input concat conditioning inputs over the channel dimension
+            # Assumes that the input concat conditioning inputs are of shape (batch, channels, seq)
+            input_concat_cond = torch.cat(concated_tensors, dim=1)
+
+        return {
+            "cross_attn_cond": cross_attention_input,
+            "cross_attn_masks": cross_attention_masks,
+            "global_cond": global_cond,
+            "input_concat_cond": input_concat_cond
+        }
+
+
+def save_audio_tensor(audio_tensor: torch.Tensor, file_path: str, sample_rate: int = 48000):
+    """
+    Saves an audio tensor to a file.
+    Params:
+        audio_tensor: torch.Tensor, The audio data to save.
+        file_path: str, The path to the file where the audio will be saved.
+        sample_rate: int, The sample rate of the audio data.
+    Returns:
+        None
+    """
+    print(f'Saving audio to {file_path}')
+    # Ensure the tensor is detached and on the CPU before saving
+    audio_tensor = audio_tensor.detach().cpu()
+    print(f'audio_tensor.shape: {audio_tensor.shape}')
+    if audio_tensor.ndim == 3:
+        audio_tensor = audio_tensor.squeeze(0)  # Remove the batch dimension
+    # Use torchaudio to save the tensor as an audio file
+    torchaudio.save(file_path, audio_tensor, sample_rate)
+    print(f'Saved audio to {file_path}')
+
+
+if __name__ == '__main__':
+    jen1 = Jen1(ckpt_path=None)
+    prompt = 'a beautiful song'
+    samples = jen1.generate(prompt=prompt)
+    save_audio_tensor(samples, 'samples.wav')
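To make the `get_mask` convention concrete: the mask is 1 over samples that are kept and 0 over the span to be generated, so text-guided generation zeroes the whole clip. A small standalone illustration using the same arithmetic as `get_mask`, with an assumed 48 kHz rate:

```python
import math
import torch

sample_rate = 48000
sample_size = 4 * sample_rate  # a 4-second clip
start, end = 1, 3              # regenerate seconds 1..3

mask = torch.ones((1, 1, sample_size))
mask[:, :, math.floor(start * sample_rate):math.ceil(end * sample_rate)] = 0

print(int(mask.sum()))         # 96000 samples kept, 96000 to generate
```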
JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/jen1/__pycache__/conditioners.cpython-311.pyc
ADDED
Binary file (12.4 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/jen1/__pycache__/conditioners.cpython-38.pyc
ADDED
Binary file (6.53 kB).

JEN-1-COMPOSER-pytorch-main/JEN-1-COMPOSER-pytorch-main/jen1/__pycache__/noise_schedule.cpython-38.pyc
ADDED
Binary file (1.11 kB).