86 lines
2.4 KiB
Text
86 lines
2.4 KiB
Text
|
#!/Users/sij/miniforge3/bin/python
|
||
|
|
||
|
import sys
|
||
|
import os
|
||
|
import tempfile
|
||
|
from pathlib import Path
|
||
|
import uuid
|
||
|
import hashlib
|
||
|
from pydub import AudioSegment
|
||
|
import torch
|
||
|
from TTS.api import TTS # Adjust with actual import
|
||
|
from playsound import playsound
|
||
|
|
||
|
from TTS.api import TTS
|
||
|
|
||
|
# Name (without extension) of the voice sample used when none is requested
# or the requested one cannot be found.
DEFAULT_VOICE = "kiel"

# Coqui model identifier handed to the TTS constructor below.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

# Inference runs on the CPU for now; 'mps' is worth retrying from time to
# time, since support for it is expected to land eventually.
device = torch.device("cpu")

# Shared synthesizer instance, built once at import time and moved to `device`.
tts = TTS(model_name=model_name).to(device)
|
||
|
|
||
|
def select_voice(voice_name: str, voice_dir=None) -> str:
    """Return the path of the reference WAV file for *voice_name*.

    Args:
        voice_name: Base name (without the ``.wav`` extension) of the voice
            sample to look up.
        voice_dir: Directory containing the ``<name>.wav`` samples, as a
            string or ``Path``. When omitted, the original hard-coded
            directory ``/Users/sij/AI/banana-phone/voices`` is used, so
            existing callers are unaffected.

    Returns:
        The path, as a string, of ``<voice_dir>/<voice_name>.wav`` when that
        file exists. Otherwise a notice is printed and the path of
        ``<voice_dir>/<DEFAULT_VOICE>.wav`` is returned; that fallback path is
        NOT checked for existence.
    """
    if voice_dir is None:
        base_dir = Path('/Users/sij/AI/banana-phone/voices')
    else:
        base_dir = Path(voice_dir)

    candidate = base_dir / f"{voice_name}.wav"
    if candidate.is_file():
        return str(candidate)

    print(f"Voice file not found for {voice_name}, using default")
    return str(base_dir / f"{DEFAULT_VOICE}.wav")
|
||
|
|
||
|
def generate_speech(text, speed, voice_file):
    """Render *text* to a uniquely named WAV file in the system temp directory.

    The work is delegated to the module-level ``tts`` object's
    ``tts_to_file`` method, with *voice_file* passed as the only
    ``speaker_wav`` entry and the language fixed to ``"en"``.

    Args:
        text: The text handed to the synthesizer.
        speed: Value forwarded unchanged as the ``speed`` argument.
        voice_file: Path of the reference voice sample; its stem also
            prefixes the output file name.

    Returns:
        ``Path`` of the generated file, named ``<voice stem>-<8 chars>.wav``.
    """
    tmp_root = Path(tempfile.gettempdir())
    tmp_root.mkdir(exist_ok=True)

    # The first eight characters of a random UUID keep successive runs from
    # writing to the same file.
    unique_tag = uuid.uuid4().hex[:8]
    voice_stem = Path(voice_file).stem
    wav_path = tmp_root / f"{voice_stem}-{unique_tag}.wav"

    synthesis_kwargs = {
        "text": text,
        "speed": speed,
        "file_path": wav_path,
        "speaker_wav": [voice_file],
        "language": "en",
    }
    tts.tts_to_file(**synthesis_kwargs)

    return wav_path
|
||
|
|
||
|
def main():
    """Command-line entry point: synthesize the given text and play it.

    Reads ``sys.argv`` as ``<text/file> [voice] [speed]``:

    * ``text/file`` -- literal text, or the path of a file whose contents
      are used as the text.
    * ``voice`` -- voice sample name; defaults to ``DEFAULT_VOICE``.
    * ``speed`` -- float forwarded to ``generate_speech``; defaults to 1.1.

    Exits with status 1 (after printing the usage line) when the text
    argument is missing or the speed is not a number. Temporary WAV files
    are removed even when synthesis, export or playback raises.
    """
    usage = "Usage: python script.py <text/file> [voice] [speed]"
    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    text_input = sys.argv[1]
    voice = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_VOICE

    # Validate the speed up front so a typo yields a usage message rather
    # than a traceback after the expensive work has started.
    try:
        speed = float(sys.argv[3]) if len(sys.argv) > 3 else 1.1
    except ValueError:
        print(f"Invalid speed {sys.argv[3]!r}: expected a number")
        print(usage)
        sys.exit(1)

    # Only short arguments are probed as file paths -- presumably to avoid
    # handing long passages of text to the filesystem (TODO confirm intent).
    if len(text_input) < 255 and os.path.isfile(text_input):
        with open(text_input, 'r', encoding='utf-8') as file:
            text = file.read()
    else:
        text = text_input

    voice_file_path = select_voice(voice)
    print(f"Using voice file at {voice_file_path}")

    # A per-run unique name keeps concurrent invocations from overwriting
    # (or deleting) each other's playback file in the shared temp directory.
    final_output_path = (
        Path(tempfile.gettempdir()) / f"output-{uuid.uuid4().hex[:8]}.wav"
    )
    temp_files = [final_output_path]
    try:
        output_file = generate_speech(text, speed, voice_file_path)
        temp_files.append(output_file)

        combined_audio = AudioSegment.silent(duration=0)
        combined_audio += AudioSegment.from_wav(str(output_file))

        # Exporting combined audio
        combined_audio.export(str(final_output_path), format="wav")

        # Now playing the generated speech file
        print(f"Playing generated speech from {final_output_path}")
        playsound(str(final_output_path))
    finally:
        # Cleanup runs on success and on failure; a file that was never
        # created is simply skipped.
        for leftover in temp_files:
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
|
||
|
|
||
|
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
|
||
|
|