Spaces:

thecollabagepatch
/

magenta

Paused

App Files Files Community

thecollabagepatch committed about 22 hours ago

Commit

c4aed03

1 Parent(s): 09491d9

drop first bar

Browse files

Files changed (1) hide show

app.py +48 -56

app.py CHANGED Viewed

@@ -168,94 +168,84 @@ def generate_loop_continuation_with_mrt(
     style_weights=None,
     bars: int = 8,
     beats_per_bar: int = 4,
-    loop_weight: float = 1.0,           # NEW
-    loudness_mode: str = "auto",        # "auto"|"lufs"|"rms"|"none"
-    loudness_headroom_db: float = 1.0,  # for the peak guard
 ):
-    # Load loop & put into model SR/channels
     loop = au.Waveform.from_file(input_wav_path).resample(mrt.sample_rate).as_stereo()
-    # Compute the model's desired context seconds (e.g., 250 frames / 25 fps = 10s)
-    codec_fps = float(mrt.codec.frame_rate)
-    ctx_seconds = float(mrt.config.context_length_frames) / codec_fps   # typically 10.0s
-    # ✅ NEW: take bar-aligned TAIL for context, if input is long enough
-    loop_for_context = take_bar_aligned_tail(
-        wav=loop,
-        bpm=bpm,
-        beats_per_bar=beats_per_bar,
-        ctx_seconds=ctx_seconds
-    )
-    print(f"[MRT] context tail: {ctx_seconds:.2f}s ≈ {loop_for_context.samples.shape[0]/loop_for_context.sample_rate:.2f}s, "
-      f"sr={loop_for_context.sample_rate}")
-    # Encode ONLY the tail (so we condition on recent audio)
     tokens_full = mrt.codec.encode(loop_for_context).astype(np.int32)
     tokens = tokens_full[:, :mrt.config.decoder_codec_rvq_depth]
-    # Context
     context_tokens = make_bar_aligned_context(
-        tokens,
-        bpm=bpm,
-        fps=int(mrt.codec.frame_rate),
-        ctx_frames=mrt.config.context_length_frames,
-        beats_per_bar=beats_per_bar,
     )
     state = mrt.init_state()
     state.context_tokens = context_tokens
-    # ---------- STYLE: weighted avg into ONE vector ----------
-    # Base embed from loop with adjustable loop_weight
-    embeds = []
-    weights = []
-    # loop embedding
-    loop_embed = mrt.embed_style(loop)
-    embeds.append(loop_embed)
-    weights.append(float(loop_weight))  # <--- use requested loop weight
-    # extra styles
     if extra_styles:
         for i, s in enumerate(extra_styles):
             if s.strip():
                 embeds.append(mrt.embed_style(s.strip()))
                 w = style_weights[i] if (style_weights and i < len(style_weights)) else 1.0
                 weights.append(float(w))
-    # Prevent all-zero weights; normalize
-    wsum = float(sum(weights))
-    if wsum <= 0.0:
-        # fallback: rely on loop to avoid NaNs
-        weights = [1.0] + [0.0] * (len(weights) - 1)
-        wsum = 1.0
     weights = [w / wsum for w in weights]
-    # weighted sum -> single style vector (match dtype)
     combined_style = np.sum([w * e for w, e in zip(weights, embeds)], axis=0).astype(loop_embed.dtype)
-    # Chunks to cover exact bars
     seconds_per_bar = beats_per_bar * (60.0 / bpm)
-    total_secs = bars * seconds_per_bar
     chunk_secs = mrt.config.chunk_length_frames * mrt.config.frame_length_samples / mrt.sample_rate  # ~2.0
-    steps = int(math.ceil(total_secs / chunk_secs)) + 1  # pad then trim
     # Generate
     chunks = []
     for _ in range(steps):
-        wav, state = mrt.generate_chunk(state=state, style=combined_style)  # ONE style vector
         chunks.append(wav)
-    # Stitch -> trim -> polish
-    out = stitch_generated(chunks, mrt.sample_rate, mrt.config.crossfade_length).as_stereo()
-    out = hard_trim_seconds(out, total_secs).peak_normalize(0.95)
     apply_micro_fades(out, 5)
-    # Loudness match to the *input loop* so the return level feels consistent
     out, loud_stats = match_loudness_to_reference(
         ref=loop, target=out,
-        method=loudness_mode,
-        headroom_db=loudness_headroom_db,
     )
     return out, loud_stats
 # ----------------------------
@@ -296,7 +286,8 @@ def generate(
     guidance_weight: float = Form(5.0),
     temperature: float = Form(1.1),
     topk: int = Form(40),
-    target_sample_rate: int | None = Form(None),  # <-- add this
 ):
     # Read file
     data = loop_audio.file.read()
@@ -327,6 +318,7 @@ def generate(
             loop_weight=loop_weight,
             loudness_mode=loudness_mode,
             loudness_headroom_db=loudness_headroom_db,
         )
     # 1) Figure out the desired SR

     style_weights=None,
     bars: int = 8,
     beats_per_bar: int = 4,
+    loop_weight: float = 1.0,
+    loudness_mode: str = "auto",
+    loudness_headroom_db: float = 1.0,
+    intro_bars_to_drop: int = 0,             # <— NEW
 ):
+    # Load & prep (unchanged)
     loop = au.Waveform.from_file(input_wav_path).resample(mrt.sample_rate).as_stereo()
+    # Use tail for context (your recent change)
+    codec_fps   = float(mrt.codec.frame_rate)
+    ctx_seconds = float(mrt.config.context_length_frames) / codec_fps
+    loop_for_context = take_bar_aligned_tail(loop, bpm, beats_per_bar, ctx_seconds)
     tokens_full = mrt.codec.encode(loop_for_context).astype(np.int32)
     tokens = tokens_full[:, :mrt.config.decoder_codec_rvq_depth]
+    # Bar-aligned token window (unchanged)
     context_tokens = make_bar_aligned_context(
+        tokens, bpm=bpm, fps=int(mrt.codec.frame_rate),
+        ctx_frames=mrt.config.context_length_frames, beats_per_bar=beats_per_bar
     )
     state = mrt.init_state()
     state.context_tokens = context_tokens
+    # STYLE embed: taken from loop_for_context (the bar-aligned tail) rather than the full loop, for a stronger “recent” bias
+    loop_embed = mrt.embed_style(loop_for_context)
+    embeds, weights = [loop_embed], [float(loop_weight)]
     if extra_styles:
         for i, s in enumerate(extra_styles):
             if s.strip():
                 embeds.append(mrt.embed_style(s.strip()))
                 w = style_weights[i] if (style_weights and i < len(style_weights)) else 1.0
                 weights.append(float(w))
+    wsum = float(sum(weights)) or 1.0
     weights = [w / wsum for w in weights]
     combined_style = np.sum([w * e for w, e in zip(weights, embeds)], axis=0).astype(loop_embed.dtype)
+    # --- Length math ---
     seconds_per_bar = beats_per_bar * (60.0 / bpm)
+    total_secs      = bars * seconds_per_bar
+    drop_bars       = max(0, int(intro_bars_to_drop))
+    drop_secs       = min(drop_bars, bars) * seconds_per_bar       # clamp to <= bars
+    gen_total_secs  = total_secs + drop_secs                       # generate extra
+    # Chunk scheduling to cover gen_total_secs
     chunk_secs = mrt.config.chunk_length_frames * mrt.config.frame_length_samples / mrt.sample_rate  # ~2.0
+    steps = int(math.ceil(gen_total_secs / chunk_secs)) + 1  # pad then trim
     # Generate
     chunks = []
     for _ in range(steps):
+        wav, state = mrt.generate_chunk(state=state, style=combined_style)
         chunks.append(wav)
+    # Stitch continuous audio
+    stitched = stitch_generated(chunks, mrt.sample_rate, mrt.config.crossfade_length).as_stereo()
+    # Trim to generated length (bars + dropped bars)
+    stitched = hard_trim_seconds(stitched, gen_total_secs)
+    # 👉 Drop the intro bars
+    if drop_secs > 0:
+        n_drop = int(round(drop_secs * stitched.sample_rate))
+        stitched = au.Waveform(stitched.samples[n_drop:], stitched.sample_rate)
+    # Final exact-length trim to requested bars
+    out = hard_trim_seconds(stitched, total_secs)
+    # Final polish AFTER drop
+    out = out.peak_normalize(0.95)
     apply_micro_fades(out, 5)
+    # Loudness match to input (after drop) so bar 1 sits right
     out, loud_stats = match_loudness_to_reference(
         ref=loop, target=out,
+        method=loudness_mode, headroom_db=loudness_headroom_db
     )
     return out, loud_stats
 # ----------------------------
     guidance_weight: float = Form(5.0),
     temperature: float = Form(1.1),
     topk: int = Form(40),
+    target_sample_rate: int | None = Form(None),
+    intro_bars_to_drop: int = Form(0),          # <— NEW
 ):
     # Read file
     data = loop_audio.file.read()
             loop_weight=loop_weight,
             loudness_mode=loudness_mode,
             loudness_headroom_db=loudness_headroom_db,
+            intro_bars_to_drop=intro_bars_to_drop,   # <— pass through
         )
     # 1) Figure out the desired SR