Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

About sampling rate #60

Open
ootsuka-repos opened this issue Jan 28, 2025 · 1 comment
Open

About sampling rate #60

ootsuka-repos opened this issue Jan 28, 2025 · 1 comment

Comments

@ootsuka-repos
Copy link

When preparing the training data, is it a problem if the audio is at 44100 Hz when it is converted to a byte sequence?

import os
import pandas as pd
import soundfile as sf
import io
from tqdm import tqdm
from scipy.signal import resample

def convert_audio_to_wav_bytes(audio_path: str, target_samplerate: int = 44100) -> bytes:
    """Convert an audio file to mono 16-bit PCM WAV bytes.

    The file is read with ``soundfile``, mixed down to a single channel by
    averaging, resampled to ``target_samplerate`` when its native rate
    differs, and finally encoded as WAV into an in-memory buffer.

    Args:
        audio_path: Path of the audio file to read.
        target_samplerate: Sample rate (Hz) of the produced WAV data.

    Returns:
        The complete WAV file contents (header included) as ``bytes``.
    """
    # Load the audio samples and the file's native sample rate.
    audio_data, samplerate = sf.read(audio_path)

    # Downmix to mono BEFORE resampling: resampling is linear along the time
    # axis, so averaging the channels first yields the same signal (up to
    # floating-point rounding) while resampling one channel instead of all.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Resample only when the native rate differs from the requested one.
    if samplerate != target_samplerate:
        num_samples = int(len(audio_data) * target_samplerate / samplerate)
        audio_data = resample(audio_data, num_samples)
        samplerate = target_samplerate

    # Encode as 16-bit PCM WAV into memory and return the raw bytes.
    buffer = io.BytesIO()
    sf.write(buffer, audio_data, samplerate, format='WAV', subtype='PCM_16')
    return buffer.getvalue()

def create_sample(audio_path: str, transcript: str) -> dict:
    """Build one dataset sample from an audio file and its transcript.

    Returns a dict holding the transcript under ``"transcript"`` and the
    encoded WAV data under ``"audio" -> "bytes"``. If converting the audio
    raises any exception, an error message is printed and ``None`` is
    returned instead.
    """
    try:
        encoded = convert_audio_to_wav_bytes(audio_path)
    except Exception as e:
        print(f"エラーが発生しました: {audio_path} - {str(e)}")
        return None

    # The raw WAV bytes are stored as-is, without any further wrapping.
    audio_field = {"bytes": encoded}
    return {"transcript": transcript, "audio": audio_field}

def process_csv(csv_path: str, output_dir: str):
    """Turn a CSV of audio paths and transcripts into a Parquet dataset.

    Reads ``csv_path``, builds one sample per row from its ``FilePath`` and
    ``Text`` columns, skips rows for which no sample could be created, and
    writes the result to ``dataset.parquet`` inside ``output_dir`` (created
    if it does not exist).
    """
    table = pd.read_csv(csv_path)

    # Make sure the destination directory exists before any work is done.
    os.makedirs(output_dir, exist_ok=True)

    collected = []
    progress = tqdm(table.iterrows(), total=len(table))
    for _, record in progress:
        item = create_sample(record['FilePath'], record['Text'])
        if not item:
            # Sample creation failed for this row; leave it out.
            continue
        collected.append(item)

    # Persist every successfully created sample as a single Parquet file.
    parquet_path = os.path.join(output_dir, "dataset.parquet")
    pd.DataFrame(collected).to_parquet(parquet_path, index=False)

    print(f"データセットがParquet形式で保存されました: {parquet_path}")

# Script entry point: build the Parquet dataset from a CSV file.
if __name__ == "__main__":
    # Imported here because only the command-line entry point needs it.
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert a CSV of audio paths and transcripts into a Parquet dataset."
    )
    # The defaults reproduce the previously hard-coded locations, so running
    # the script without arguments behaves exactly as before.
    parser.add_argument(
        "--csv-path",
        default="/home/nidera515/OuteTTS/out.csv",
        help="Input CSV file (must contain 'FilePath' and 'Text' columns).",
    )
    parser.add_argument(
        "--output-dir",
        default="/home/nidera515/OuteTTS/output_dataset",
        help="Directory in which dataset.parquet is written.",
    )
    args = parser.parse_args()

    # Process the CSV and generate the dataset.
    process_csv(args.csv_path, args.output_dir)
@edwko
Copy link
Owner

edwko commented Jan 30, 2025

Should be fine: the interface automatically resamples to the required sample rate, which for v0.3 is 24 kHz.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants