diff --git a/pyproject.toml b/pyproject.toml
index 63a50fac..42adf209 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,6 +75,7 @@ dependencies = [
     "tzdata == 2023.3",
     "rapidocr-onnxruntime == 1.3.8",
     "stripe == 7.3.0",
+    "openai-whisper >= 20231117",
 ]
 dynamic = ["version"]
 
diff --git a/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py b/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
index c3e3c41d..37337791 100644
--- a/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
+++ b/src/khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py
@@ -1,4 +1,4 @@
-# Generated by Django 4.2.7 on 2023-11-26 09:37
+# Generated by Django 4.2.7 on 2023-11-26 13:54
 
 from django.db import migrations, models
 
@@ -15,11 +15,11 @@ class Migration(migrations.Migration):
                 ("id", models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                 ("created_at", models.DateTimeField(auto_now_add=True)),
                 ("updated_at", models.DateTimeField(auto_now=True)),
-                ("model_name", models.CharField(default="whisper-1", max_length=200)),
+                ("model_name", models.CharField(default="base", max_length=200)),
                 (
                     "model_type",
                     models.CharField(
-                        choices=[("openai", "Openai"), ("offline", "Offline")], default="openai", max_length=200
+                        choices=[("openai", "Openai"), ("offline", "Offline")], default="offline", max_length=200
                     ),
                 ),
             ],
diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 77478ef5..82348fbe 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -125,8 +125,8 @@ class SpeechToTextModelOptions(BaseModel):
         OPENAI = "openai"
         OFFLINE = "offline"
 
-    model_name = models.CharField(max_length=200, default="whisper-1")
-    model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OPENAI)
+    model_name = models.CharField(max_length=200, default="base")
+    model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
 
 
 class ChatModelOptions(BaseModel):
diff --git a/src/khoj/processor/conversation/offline/whisper.py b/src/khoj/processor/conversation/offline/whisper.py
new file mode 100644
index 00000000..d22486a9
--- /dev/null
+++ b/src/khoj/processor/conversation/offline/whisper.py
@@ -0,0 +1,17 @@
+# External Packages
+from asgiref.sync import sync_to_async
+import whisper
+
+# Internal Packages
+from khoj.utils import state
+
+
+async def transcribe_audio_offline(audio_filename: str, model: str) -> str | None:
+    """
+    Transcribe audio file offline using Whisper
+    """
+    # Lazily load the Whisper model on first use, then reuse it across requests
+    if not state.whisper_model:
+        state.whisper_model = whisper.load_model(model)
+    response = await sync_to_async(state.whisper_model.transcribe)(audio_filename)
+    return response["text"]
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 67b959a7..9f1b118e 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -31,6 +31,7 @@ from khoj.database.models import (
     NotionConfig,
 )
 from khoj.processor.conversation.offline.chat_model import extract_questions_offline
+from khoj.processor.conversation.offline.whisper import transcribe_audio_offline
 from khoj.processor.conversation.openai.gpt import extract_questions
 from khoj.processor.conversation.openai.whisper import transcribe_audio
 from khoj.processor.conversation.prompts import help_message, no_entries_found
@@ -605,13 +606,16 @@ async def transcribe(request: Request, common: CommonQueryParams, file: UploadFi
         # Send the audio data to the Whisper API
         speech_to_text_config = await ConversationAdapters.get_speech_to_text_config()
         openai_chat_config = await ConversationAdapters.get_openai_chat_config()
-        if not openai_chat_config or not speech_to_text_config:
+        if not speech_to_text_config:
             # If the user has not configured a speech to text model, return an unprocessable entity error
             status_code = 422
-        elif speech_to_text_config.model_type == ChatModelOptions.ModelType.OPENAI:
+        elif openai_chat_config and speech_to_text_config.model_type == ChatModelOptions.ModelType.OPENAI:
             api_key = openai_chat_config.api_key
             speech2text_model = speech_to_text_config.model_name
-            user_message = await transcribe_audio(model=speech2text_model, audio_file=audio_file, api_key=api_key)
+            user_message = await transcribe_audio(audio_file, model=speech2text_model, api_key=api_key)
+        elif speech_to_text_config.model_type == ChatModelOptions.ModelType.OFFLINE:
+            speech2text_model = speech_to_text_config.model_name
+            user_message = await transcribe_audio_offline(audio_filename, model=speech2text_model)
     finally:
         # Close and Delete the temporary audio file
         audio_file.close()
diff --git a/src/khoj/utils/initialization.py b/src/khoj/utils/initialization.py
index ee0454c4..313b18fc 100644
--- a/src/khoj/utils/initialization.py
+++ b/src/khoj/utils/initialization.py
@@ -74,10 +74,9 @@ def initialization():
         except ModuleNotFoundError as e:
             logger.warning("Offline models are not supported on this device.")
 
-        use_openai_model = input("Use OpenAI chat model? (y/n): ")
-
+        use_openai_model = input("Use OpenAI models? (y/n): ")
         if use_openai_model == "y":
-            logger.info("🗣️ Setting up OpenAI chat model")
+            logger.info("🗣️ Setting up your OpenAI configuration")
             api_key = input("Enter your OpenAI API key: ")
             OpenAIProcessorConversationConfig.objects.create(api_key=api_key)
 
@@ -104,7 +103,25 @@ def initialization():
                 model_name=openai_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OPENAI
             )
 
-        logger.info("🗣️ Chat model configuration complete")
+        if use_offline_model == "y" or use_openai_model == "y":
+            logger.info("🗣️ Chat model configuration complete")
+
+        use_offline_speech2text_model = input("Use offline speech to text model? (y/n): ")
+        if use_offline_speech2text_model == "y":
+            logger.info("🗣️ Setting up offline speech to text model")
+            # Delete any existing speech to text model options. There can only be one.
+            SpeechToTextModelOptions.objects.all().delete()
+
+            default_offline_speech2text_model = "base"
+            offline_speech2text_model = input(
+                f"Enter the Whisper model to use offline (default: {default_offline_speech2text_model}): "
+            )
+            offline_speech2text_model = offline_speech2text_model or default_offline_speech2text_model
+            SpeechToTextModelOptions.objects.create(
+                model_name=offline_speech2text_model, model_type=SpeechToTextModelOptions.ModelType.OFFLINE
+            )
+
+            logger.info(f"🗣️ Offline speech to text model configured to {offline_speech2text_model}")
 
     admin_user = KhojUser.objects.filter(is_staff=True).first()
     if admin_user is None:
diff --git a/src/khoj/utils/state.py b/src/khoj/utils/state.py
index 91f5f0ce..ce4d5804 100644
--- a/src/khoj/utils/state.py
+++ b/src/khoj/utils/state.py
@@ -21,6 +21,7 @@ embeddings_model: EmbeddingsModel = None
 cross_encoder_model: CrossEncoderModel = None
 content_index = ContentIndex()
 gpt4all_processor_config: GPT4AllProcessorModel = None
+whisper_model = None
 config_file: Path = None
 verbose: int = 0
 host: str = None
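
Note (not part of the patch): a minimal sketch of how the new offline transcription helper can be exercised directly, assuming the openai-whisper dependency added above is installed along with ffmpeg, which Whisper uses to decode audio. The "sample.webm" filename is a hypothetical stand-in for a recorded audio file.

import asyncio

from khoj.processor.conversation.offline.whisper import transcribe_audio_offline


async def main():
    # The first call lazily loads the requested Whisper model into khoj.utils.state;
    # subsequent calls reuse the cached model instead of reloading it from disk.
    transcript = await transcribe_audio_offline("sample.webm", model="base")
    print(transcript)


asyncio.run(main())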