WorkflowMusicDownload_Labeling.py

#  Step 1 - Download song
# Download the music and fetch its metadata (genre, artist) to use as labels, from one of these providers:
- `spotdl` for Spotify:    https://pastebin.com/8JNTHRHc
- `scdl`   for Soundcloud: https://github.com/scdl-org/scdl

# Optional: if each folder name corresponds to a playlist (e.g. "chill"), prepend it as a category tag to each label:
```python
import os # Append prefix tags into each label.txt using folder name.
def modify_txt_files(folder_path):
    """Prefix every ``.txt`` label file in ``folder_path`` with a mood tag.

    The tag is ``"in mood for <name>, "`` where ``<name>`` is the last
    component of ``folder_path`` (so ``"music/chill/"`` tags as ``"chill"``,
    not as the whole path). Files that already contain the tag are left
    untouched, which makes repeated runs safe.

    Args:
        folder_path: Directory holding the label files; its name is the tag.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
    """
    # abspath also strips trailing separators and resolves "." / "..".
    mood = os.path.basename(os.path.abspath(folder_path))
    prefix = f"in mood for {mood}, "
    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue
        txt_path = os.path.join(folder_path, filename)
        # errors='ignore' drops undecodable bytes instead of aborting the run.
        with open(txt_path, 'r+', encoding='utf-8', errors='ignore') as f:
            content = f.read()
            if prefix in content:
                continue  # already tagged on a previous run
            f.seek(0)
            f.write(prefix + content)
            # Dropped bytes could make the new text shorter than the old file.
            f.truncate()


if __name__ == "__main__":
    modify_txt_files("chill")
```


#  Step 2 - Label music.mp3 with a more descriptive caption

# Option 1: use a Hugging Face model tagged `audio-classification`; for the model "laion/larger_clap_music_and_speech" we can use https://github.com/lyramakesmusic/clap-interrogator

# Option 2: use a `Multimodal LLM` like Qwen-Audio; usage: https://pastebin.com/cPXEUrxf (or to label music: https://pastebin.com/G1BB1yTg)

# However, after comparing label quality, this workflow moved to Gemini:
# gemini_LabelClassification.py
import os, base64, requests, json, time
from termcolor import colored
from tqdm import tqdm

# Keys are used in list order; rotation wraps around when the end is reached.
API_KEYS = [
    os.environ.get("GOOGLE_API_KEY"),  # from environment variable
    #"AI.API-Key2", # If encountering free tier rate limit.
    #"AI.API-Key3",
    #"AI.API-Key4",
]

# REST endpoint root and the model every request is sent to.
BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
MODEL = "gemini-1.5-pro-002"

# Headers sent with every request.
HEADERS = {"Content-Type": "application/json"}

# JSON file recording which audio files have already been labeled.
PROCESSED_FILES = "processed_files.json"

# Position in API_KEYS of the key currently in use (starts with the first).
api_key_index = 0

def get_next_api_key(delay_seconds=180):
    """Rotate to the next API key, wait, and return the newly selected key.

    The module-level ``api_key_index`` is advanced by one and wraps around to
    the first key after the last. With a single configured key the same key
    is selected again, so the call then acts as a plain back-off delay.

    Args:
        delay_seconds: How long to sleep before returning. Defaults to 180
            (three minutes), the wait used between retries in this script.

    Returns:
        The entry of ``API_KEYS`` at the new index.
    """
    global api_key_index
    api_key_index = (api_key_index + 1) % len(API_KEYS)
    time.sleep(delay_seconds)
    return API_KEYS[api_key_index]

def generate_content_with_audio(prompt, audio_file_path):
    """Send ``prompt`` plus an audio file to the model and return its text.

    The audio is sent inline (base64) to the ``generateContent`` endpoint of
    ``MODEL``. The call retries indefinitely: on a non-200 status, a network
    failure, or a 200 response that carries no text, it logs the problem,
    rotates to the next API key via ``get_next_api_key()`` (which also waits)
    and tries again.

    Args:
        prompt: Instruction text sent alongside the audio.
        audio_file_path: Path of the audio file; it is sent as ``audio/mp3``.

    Returns:
        The stripped text of the first part of the first candidate.

    Raises:
        OSError: If the audio file cannot be read.
    """
    # Seconds to wait for the server before treating the attempt as failed;
    # without a timeout a stalled connection would block the batch forever.
    request_timeout = 300

    # Read and encode once: the payload is identical for every retry.
    with open(audio_file_path, "rb") as audio_file:
        audio_content = base64.b64encode(audio_file.read()).decode('utf-8')

    payload = {
        "contents": [{"parts": [{"text": prompt}, {"inlineData": {"mimeType": "audio/mp3", "data": audio_content}}]}],
        "generationConfig": {
            "temperature": 0.2,
            "topP": 0.85,
            "topK": 40,
            "maxOutputTokens": 1000,
        },
        "safetySettings": [
            {"category": cat, "threshold": "BLOCK_NONE"} for cat in [
                "HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH",
                "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"
            ]
        ]
    }

    while True:  # Keep retrying indefinitely until successful
        # Re-read the index each round: get_next_api_key() moves it on failure.
        api_key = API_KEYS[api_key_index]
        url = f"{BASE_URL}/models/{MODEL}:generateContent?key={api_key}"

        tqdm.write(colored(f"Prompt: {prompt}", 'blue'))
        try:
            response = requests.post(url, headers=HEADERS, json=payload, timeout=request_timeout)
        except requests.RequestException as exc:
            error_text = f"request failed: {exc}"
        else:
            if response.status_code == 200:
                try:
                    result = response.json()['candidates'][0]['content']['parts'][0]['text'].strip()
                except (KeyError, IndexError, TypeError, ValueError) as exc:
                    # A 200 can still lack text (e.g. no candidates returned).
                    error_text = f"200 without text ({exc!r}), {response.text}"
                else:
                    tqdm.write(colored(f"Response: {result}", 'green'))
                    return result
            else:
                error_text = f"{response.status_code}, {response.text}"

        # On any error, switch to the next API key and retry
        tqdm.write(colored(f"Error: {error_text}", 'red'))
        # str() keeps the log line from crashing when the key is unset (None).
        tqdm.write(colored(f"Switching in 3 min to next API Key: {str(api_key)[:10]}...", 'yellow'))
        get_next_api_key()


def append_at_prefix(file_path, content):
    """Put ``content`` at the front of an existing UTF-8 text file.

    The current file text is stripped of surrounding whitespace. If anything
    remains, the file becomes ``"<content> <old text>"``; otherwise it
    becomes just ``content``. The file must already exist.
    """
    with open(file_path, 'r+', encoding='utf-8') as handle:
        previous = handle.read().strip()
        if previous:
            merged = f"{content} {previous}"
        else:
            merged = content
        # Rewind, overwrite from the start, then cut off any leftover tail.
        handle.seek(0, 0)
        handle.write(merged)
        handle.truncate()

def load_processed_files():
    """Load the record of already-labeled files from ``PROCESSED_FILES``.

    Returns:
        The parsed JSON content of the file, or an empty dict when the file
        does not exist yet.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not os.path.exists(PROCESSED_FILES):
        return {}
    # The context manager closes the handle, which the one-liner never did.
    with open(PROCESSED_FILES, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_processed_files(processed):
    """Persist the record of already-labeled files to ``PROCESSED_FILES``.

    The data is written to a sibling temporary file first and then moved over
    the real file, so an interruption mid-write cannot leave a truncated
    record that would break the next run.

    Args:
        processed: JSON-serializable mapping of processed file names.
    """
    tmp_path = PROCESSED_FILES + ".tmp"
    # The context manager flushes and closes the handle before the swap.
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(processed, f)
    os.replace(tmp_path, PROCESSED_FILES)

def main(folder_path="myfolder"):
    """Label every ``.mp3`` in ``folder_path`` with a model-written caption.

    For each audio file not yet listed in the processed-files record, the
    caption is requested from the model and placed at the front of the
    sibling ``.txt`` file (created empty if missing). The record is saved
    after every file, so an interrupted run resumes where it stopped.

    Args:
        folder_path: Directory holding the ``.mp3`` files. Per the original
            note, gemini supports audio files of <20MB.
    """
    prompt = """You are the CLAP Interrogator. Provide a concise, 25-word description of the audio, including genre, mood, instruments, and notable nuance features and information to label for this music track."""
    processed_files = load_processed_files()
    mp3_files = [f for f in os.listdir(folder_path) if f.endswith(".mp3")]
    for filename in tqdm(mp3_files, desc="Processing", unit="file"):
        if filename in processed_files:
            continue
        stem = os.path.splitext(filename)[0]
        mp3_path = os.path.join(folder_path, filename)
        txt_path = os.path.join(folder_path, stem + ".txt")
        tqdm.write(f"Processing '{stem}.txt'")
        # append_at_prefix opens the label in 'r+' mode, so it must exist.
        if not os.path.exists(txt_path):
            open(txt_path, 'w', encoding='utf-8').close()
        while True:
            label = generate_content_with_audio(prompt, mp3_path)
            if label:
                append_at_prefix(txt_path, label)
                processed_files[filename] = True
                save_processed_files(processed_files)
                break
            # An empty caption counts as a failure: wait, then ask again.
            tqdm.write(colored("Error occurred. Retrying in 3 minutes...", 'yellow'))
            time.sleep(180)


if __name__ == "__main__":
    main()


# Step 3 Optional. Append key + BPM (estimated with `librosa`) to each .txt.
# keyBPMlabel.py:
import os
import numpy as np
import librosa # Fix: pip install "numpy>=1.22.4,<2.3.0" librosa
from tqdm import tqdm

def extract_key_and_bpm(file_path):
    """Estimate the tempo and dominant pitch class of an audio file.

    The "key" is the pitch class with the highest average chroma energy over
    the whole track. It is a rough tonal-centre estimate with no major/minor
    mode, and it carries no octave number.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        ``{'tempo': int, 'key': str}`` with the tempo rounded to whole BPM,
        or ``None`` when the file cannot be processed or the estimates are
        not usable (non-finite or non-positive tempo, non-finite chroma).
    """
    try:
        y, sr = librosa.load(file_path, sr=None)  # Load at original sample rate
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        if isinstance(tempo, np.ndarray):  # Ensure tempo is a single value
            tempo = tempo.mean()
        if not np.isfinite(tempo) or tempo <= 0:  # Check for invalid tempo values
            return None

        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_mean = np.mean(chroma_stft, axis=1)
        # Validate the energies themselves: argmax always returns a finite
        # integer, so checking its result could never detect bad data.
        if not np.all(np.isfinite(chroma_mean)):
            return None

        # The chroma row index is a pitch class (0-11), not a MIDI note:
        # octave=False yields "C", "D", ... instead of "C-1", "D-1", ...
        key_idx = int(np.argmax(chroma_mean))
        key = librosa.midi_to_note(key_idx, octave=False)
        return {
            'tempo': int(np.round(tempo)),  # Use np.round for numpy arrays and floats
            'key': key
        }
    except Exception as e:
        # Best-effort per file: report the failure and let the batch go on.
        print(f"Error processing {file_path}: {e}")
        return None

def process_files(folder_path):
    """Add tempo and key information to the label of every ``.mp3`` file.

    For each ``.mp3`` in ``folder_path`` the sibling ``.txt`` label is
    updated: an existing label gets ``"at tempo <n> BPM in the key of <k>"``
    appended, a missing label is created as ``"Tempo: <n> BPM, Key: <k>"``.
    Files whose analysis fails are skipped, and labels that already carry the
    same information are left alone so the script can be re-run safely.

    Args:
        folder_path: Directory holding the ``.mp3`` files and their labels.
    """
    mp3_files = [f for f in os.listdir(folder_path) if f.endswith('.mp3')]
    for filename in tqdm(mp3_files, desc="Processing Files", unit="file"):
        mp3_file = os.path.join(folder_path, filename)
        # splitext swaps only the extension; str.replace would also rewrite
        # a ".mp3" that appears in the middle of the name.
        txt_file = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')
        audio_info = extract_key_and_bpm(mp3_file)  # Extract audio information
        if not audio_info:
            continue

        suffix = f"at tempo {audio_info['tempo']} BPM in the key of {audio_info['key']}"
        standalone = f"Tempo: {audio_info['tempo']} BPM, Key: {audio_info['key']}"

        if os.path.exists(txt_file):
            with open(txt_file, 'r', encoding='utf-8') as file:
                content = file.read().strip()
            if content.endswith(suffix) or content == standalone:
                continue  # already annotated on a previous run
            # Avoid a leading space when the existing label is empty.
            new_content = f"{content} {suffix}" if content else suffix
        else:
            # If the text file does not exist, create it with the extracted information
            new_content = standalone

        with open(txt_file, 'w', encoding='utf-8') as file:
            file.write(new_content)

# Run only when executed as a script, so importing the module for its
# functions does not start processing a placeholder path.
if __name__ == "__main__":
    folder_path = r"C:\path\to\myfolder"  # replace with the real music folder
    process_files(folder_path)


# Step 4 Optional. Deduplicate: https://pastebin.com/04LgbNsR