1
0
mirror of https://github.com/ijaric/voice_assistant.git synced 2025-05-24 14:33:26 +00:00

Merge branch 'main' into tasks/#41_assistant_base_tests

This commit is contained in:
Artem Litvinov 2023-10-12 16:21:50 +01:00 committed by GitHub
commit fed97f81ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
31 changed files with 740 additions and 1577 deletions

View File

@ -5,6 +5,12 @@ POSTGRES_USER=user
POSTGRES_PASSWORD=Qwe123 POSTGRES_PASSWORD=Qwe123
POSTGRES_DB_NAME=api_db POSTGRES_DB_NAME=api_db
PROXY_HOST=255.255.255.255
PROXY_PORT=8888
PROXY_USER=YOUR_USER
PROXY_PASSWORD=YOUR_PASSWORD
PROXY_ENABLE=False
NGINX_PORT=80 NGINX_PORT=80
API_HOST=0.0.0.0 API_HOST=0.0.0.0
API_PORT=8000 API_PORT=8000
@ -17,3 +23,10 @@ JWT_SECRET_KEY=v9LctjUWwol4XbvczPiLFMDtZ8aal7mm
JWT_ALGORITHM=HS256 JWT_ALGORITHM=HS256
APP_RELOAD=True APP_RELOAD=True
VOICE_AVAILABLE_FORMATS=mp3,ogg,wav
VOICE_MAX_INPUT_SIZE=5120 # 5MB
VOICE_MAX_INPUT_SECONDS=30
OPENAI_API_KEY=sk-1234567890
OPENAI_STT_MODEL=whisper-1

View File

@ -11,6 +11,7 @@ import lib.app.errors as app_errors
import lib.app.settings as app_settings import lib.app.settings as app_settings
import lib.app.split_settings as app_split_settings import lib.app.split_settings as app_split_settings
import lib.clients as clients import lib.clients as clients
import lib.stt as stt
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -57,9 +58,21 @@ class Application:
logger.info("Initializing clients") logger.info("Initializing clients")
http_yandex_tts_client = clients.AsyncHttpClient(
base_url="yandex", # todo add yandex api url from settings
proxy_settings=settings.proxy,
)
disposable_resources.append(
DisposableResource(
name="http_client yandex",
dispose_callback=http_yandex_tts_client.close(),
)
)
# Repositories # Repositories
logger.info("Initializing repositories") logger.info("Initializing repositories")
stt_repository: stt.STTProtocol = stt.OpenaiSpeechRepository(settings=settings)
# Caches # Caches
@ -68,6 +81,7 @@ class Application:
# Services # Services
logger.info("Initializing services") logger.info("Initializing services")
stt_service: stt.SpeechService = stt.SpeechService(repository=stt_repository) # type: ignore
# Handlers # Handlers

View File

@ -13,6 +13,12 @@ class Settings(pydantic_settings.BaseSettings):
logger: app_split_settings.LoggingSettings = pydantic.Field( logger: app_split_settings.LoggingSettings = pydantic.Field(
default_factory=lambda: app_split_settings.LoggingSettings() default_factory=lambda: app_split_settings.LoggingSettings()
) )
openai: app_split_settings.OpenaiSettings = pydantic.Field(
default_factory=lambda: app_split_settings.OpenaiSettings()
)
project: app_split_settings.ProjectSettings = pydantic.Field( project: app_split_settings.ProjectSettings = pydantic.Field(
default_factory=lambda: app_split_settings.ProjectSettings() default_factory=lambda: app_split_settings.ProjectSettings()
) )
proxy: app_split_settings.ProxySettings = pydantic.Field(default_factory=lambda: app_split_settings.ProxySettings())
voice: app_split_settings.VoiceSettings = pydantic.Field(default_factory=lambda: app_split_settings.VoiceSettings())

View File

@ -1,14 +1,20 @@
from .api import * from .api import *
from .app import * from .app import *
from .logger import * from .logger import *
from .openai import *
from .postgres import * from .postgres import *
from .project import * from .project import *
from .proxy import *
from .voice import *
__all__ = [ __all__ = [
"ApiSettings", "ApiSettings",
"AppSettings", "AppSettings",
"LoggingSettings", "LoggingSettings",
"OpenaiSettings",
"PostgresSettings", "PostgresSettings",
"ProjectSettings", "ProjectSettings",
"ProxySettings",
"VoiceSettings",
"get_logging_config", "get_logging_config",
] ]

View File

@ -5,7 +5,9 @@ import lib.app.split_settings.utils as app_split_settings_utils
class LoggingSettings(pydantic_settings.BaseSettings): class LoggingSettings(pydantic_settings.BaseSettings):
model_config = pydantic_settings.SettingsConfigDict( model_config = pydantic_settings.SettingsConfigDict(
env_file=app_split_settings_utils.ENV_PATH, env_file_encoding="utf-8", extra="ignore" env_file=app_split_settings_utils.ENV_PATH,
env_file_encoding="utf-8",
extra="ignore",
) )
log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

View File

@ -0,0 +1,18 @@
import pydantic
import pydantic_settings
import lib.app.split_settings.utils as app_split_settings_utils
class OpenaiSettings(pydantic_settings.BaseSettings):
    """Settings for the OpenAI integration, loaded from the environment or the env file."""

    model_config = pydantic_settings.SettingsConfigDict(
        env_file=app_split_settings_utils.ENV_PATH,
        env_prefix="OPENAI_",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Required secret (``default=...`` marks the field as mandatory). It is accepted under
    # either of the two aliases listed below.
    # NOTE(review): pydantic-settings documents that env_prefix is not applied to aliased
    # fields, which is presumably why "openai_api_key" is spelled out in full -- confirm.
    api_key: pydantic.SecretStr = pydantic.Field(
        default=..., validation_alias=pydantic.AliasChoices("api_key", "openai_api_key")
    )
    # Name of the speech-to-text model passed to the OpenAI client.
    stt_model: str = "whisper-1"

View File

@ -0,0 +1,43 @@
import typing
import pydantic
import pydantic_settings
import lib.app.split_settings.utils as app_split_settings_utils
class ProxySettings(pydantic_settings.BaseSettings):
    """Outbound proxy configuration read from ``PROXY_*`` variables.

    ``host`` and ``port`` are mandatory only when ``enable`` is true; this is
    enforced by :meth:`check_proxy` after the model is built.
    """

    model_config = pydantic_settings.SettingsConfigDict(
        env_file=app_split_settings_utils.ENV_PATH,
        env_prefix="PROXY_",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    protocol: typing.Literal["http", "socks5"] = "http"
    user: str | None = None
    password: pydantic.SecretStr | None = None
    host: str | None = None
    port: int | None = None
    enable: bool = False

    @property
    def dsn(self) -> str:
        """Return the proxy URL including the real password (do not log this value).

        Credentials are percent-encoded so that characters such as ``@``, ``:``
        or ``/`` inside the user name or password cannot corrupt the URL.
        """
        if self.user and self.password:
            import urllib.parse

            user = urllib.parse.quote(self.user, safe="")
            password = urllib.parse.quote(self.password.get_secret_value(), safe="")
            return f"{self.protocol}://{user}:{password}@{self.host}:{self.port}"
        return f"{self.protocol}://{self.host}:{self.port}"

    @pydantic.computed_field
    @property
    def dsn_as_safe_url(self) -> str:
        """Return the proxy URL with the password masked, suitable for logs and dumps."""
        if self.user and self.password:
            # Formatting a SecretStr yields its masked representation, not the secret.
            return f"{self.protocol}://{self.user}:{self.password}@{self.host}:{self.port}"
        return f"{self.protocol}://{self.host}:{self.port}"

    @pydantic.model_validator(mode="after")
    def check_proxy(self):
        """Require ``host`` and ``port`` whenever the proxy is enabled.

        Raises:
            ValueError: if ``enable`` is true but ``host`` or ``port`` is missing.
        """
        if not self.enable:
            return self
        if self.host and self.port:
            return self
        raise ValueError("Proxy host and port must be set if proxy is enabled (PROXY_ENABLE=True)")

View File

@ -0,0 +1,21 @@
import pydantic
import pydantic_settings
import lib.app.split_settings.utils as app_split_settings_utils
class VoiceSettings(pydantic_settings.BaseSettings):
    """Limits and accepted formats for incoming voice messages (``VOICE_*`` variables)."""

    model_config = pydantic_settings.SettingsConfigDict(
        env_file=app_split_settings_utils.ENV_PATH,
        env_prefix="VOICE_",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Maximum accepted audio duration, in seconds.
    max_input_seconds: int = 30
    max_input_size: int = 5120  # 5MB
    # Comma-separated list in the environment; exposed as a list of format names after
    # validation. validate_default=True makes the validator run for the default as well,
    # otherwise the attribute would stay a raw string and ``in`` checks against it would
    # silently become substring matches.
    available_formats: str = pydantic.Field(default="wav,mp3,ogg", validate_default=True)

    @pydantic.field_validator("available_formats")
    @classmethod
    def validate_available_formats(cls, v: str) -> list[str]:
        """Split the comma-separated value into format names.

        Surrounding whitespace is stripped and empty items are dropped, so
        ``"mp3, ogg,"`` yields ``["mp3", "ogg"]``.
        """
        return [item.strip() for item in v.split(",") if item.strip()]

View File

@ -1,3 +1,7 @@
from .http_client import AsyncHttpClient
from .postgres import AsyncPostgresClient from .postgres import AsyncPostgresClient
__all__ = ["AsyncPostgresClient"] __all__ = [
"AsyncHttpClient",
"AsyncPostgresClient",
]

View File

@ -0,0 +1,29 @@
import typing
import httpx
import lib.app.split_settings as app_split_settings
class AsyncHttpClient(httpx.AsyncClient):
    """``httpx.AsyncClient`` that takes its proxy configuration from ``ProxySettings``."""

    def __init__(
        self,
        proxy_settings: app_split_settings.ProxySettings,
        base_url: str | None = None,
        **client_params: typing.Any,
    ) -> None:
        """Store the configuration on the instance and initialise the underlying client.

        :param proxy_settings: proxy configuration; only used when it is enabled
        :param base_url: optional base URL, an empty string is used when omitted
        :param client_params: extra keyword arguments forwarded to ``httpx.AsyncClient``
        """
        self.proxy_settings = proxy_settings
        self.base_url = base_url or ""
        self.proxies = self.__get_proxies_from_settings()
        self.client_params = client_params

        super().__init__(base_url=self.base_url, proxies=self.proxies, **client_params)  # type: ignore[reportGeneralTypeIssues]

    def __get_proxies_from_settings(self) -> dict[str, str] | None:
        """Return the proxy mapping for all schemes, or ``None`` when the proxy is disabled."""
        if self.proxy_settings.enable:
            return {"all://": self.proxy_settings.dsn}
        return None

    async def close(self) -> None:
        """Close the client by delegating to ``aclose``."""
        await self.aclose()

View File

@ -1,4 +1,20 @@
from .orm import Base, IdCreatedUpdatedBaseMixin from .orm import Base, IdCreatedUpdatedBaseMixin
from .token import Token from .token import Token
from .tts import *
__all__ = ["Base", "IdCreatedUpdatedBaseMixin", "Token"] __all__ = [
"AVAILABLE_MODELS_TYPE",
"Base",
"BaseLanguageCodesEnum",
"BaseVoiceModel",
"ElevenLabsLanguageCodesEnum",
"IdCreatedUpdatedBaseMixin",
"LANGUAGE_CODES_ENUM_TYPE",
"LIST_VOICE_MODELS_TYPE",
"TTSCreateRequestModel",
"TTSCreateResponseModel",
"TTSSearchVoiceRequestModel",
"Token",
"VoiceModelProvidersEnum",
"YandexLanguageCodesEnum",
]

View File

@ -0,0 +1,16 @@
from .models import *
from .voice import *
__all__ = [
"AVAILABLE_MODELS_TYPE",
"BaseLanguageCodesEnum",
"BaseVoiceModel",
"ElevenLabsLanguageCodesEnum",
"LANGUAGE_CODES_ENUM_TYPE",
"LIST_VOICE_MODELS_TYPE",
"TTSCreateRequestModel",
"TTSCreateResponseModel",
"TTSSearchVoiceRequestModel",
"VoiceModelProvidersEnum",
"YandexLanguageCodesEnum",
]

View File

@ -0,0 +1,31 @@
import pydantic
import lib.models.tts.voice as models_tts_voice
import lib.models.tts.voice.languages as models_tts_languages
# Union of every provider-specific voice model.
AVAILABLE_MODELS_TYPE = models_tts_voice.YandexVoiceModel | models_tts_voice.ElevenLabsVoiceModel
# Union of every provider-specific "list of voice models" container.
LIST_VOICE_MODELS_TYPE = models_tts_voice.YandexListVoiceModelsModel | models_tts_voice.ElevenLabsListVoiceModelsModel
class TTSCreateRequestModel(pydantic.BaseModel):
    """Request to synthesise ``text`` with the given voice model."""

    model_config = pydantic.ConfigDict(use_enum_values=True)

    # Voice model (of any supported provider) used for synthesis.
    voice_model: AVAILABLE_MODELS_TYPE
    # Text to convert to speech.
    text: str
class TTSCreateResponseModel(pydantic.BaseModel):
    """Result of a text-to-speech request."""

    # Raw audio bytes; the container/encoding is not specified here.
    audio_content: bytes
class TTSSearchVoiceRequestModel(pydantic.BaseModel):
    """Search criteria for voice models; at least one criterion must be provided."""

    voice_id: str | None = None
    voice_name: str | None = None
    languages: list[models_tts_languages.LANGUAGE_CODES_ENUM_TYPE] | None = None
    company_name: str | None = None

    @pydantic.model_validator(mode="after")
    def check_at_least_one_field(self):
        """Reject an empty search request.

        ``voice_id`` counts as a criterion too, so searching by id alone is valid.

        Raises:
            ValueError: if every search field is empty or ``None``.
        """
        if not any((self.voice_id, self.voice_name, self.languages, self.company_name)):
            raise ValueError("At least one field required")
        return self

View File

@ -0,0 +1,17 @@
from .base import *
from .eleven_labs import *
from .languages import *
from .yandex import *
__all__ = [
"BaseLanguageCodesEnum",
"BaseVoiceModel",
"ElevenLabsLanguageCodesEnum",
"ElevenLabsListVoiceModelsModel",
"ElevenLabsVoiceModel",
"LANGUAGE_CODES_ENUM_TYPE",
"VoiceModelProvidersEnum",
"YandexLanguageCodesEnum",
"YandexListVoiceModelsModel",
"YandexVoiceModel",
]

View File

@ -0,0 +1,27 @@
import enum
import typing
import pydantic
import lib.models.tts.voice.languages as models_tts_languages
class VoiceModelProvidersEnum(enum.Enum):
    """Identifiers of the supported text-to-speech providers."""

    YANDEX = "yandex"
    ELEVEN_LABS = "eleven_labs"
class BaseVoiceModel(pydantic.BaseModel):
    """Common description of a TTS voice shared by all providers."""

    voice_id: str
    # Falls back to ``voice_id`` when omitted (see the validator below).
    voice_name: str | None = None
    languages: list[models_tts_languages.LANGUAGE_CODES_ENUM_TYPE]
    provider: VoiceModelProvidersEnum

    @pydantic.model_validator(mode="before")
    @classmethod
    def check_voice_name_exists(cls, data: typing.Any) -> typing.Any:
        """Default ``voice_name`` to ``voice_id`` when no name is supplied.

        A "before" validator receives the raw input, which is not guaranteed to
        be a mapping (for example, an already-built model instance), so anything
        that is not a dict is passed through untouched for pydantic to handle.
        """
        if not isinstance(data, dict):
            return data
        voice_id = data.get("voice_id")
        voice_name = data.get("voice_name")
        if not voice_name and voice_id:
            data["voice_name"] = voice_id
        return data

View File

@ -0,0 +1,75 @@
import typing
import pydantic
import lib.models.tts.voice.base as models_tts_base
import lib.models.tts.voice.languages as models_tts_languages
class ElevenLabsVoiceModel(models_tts_base.BaseVoiceModel):
    """Voice model whose provider defaults to ElevenLabs."""

    # Enum fields are stored as their plain values rather than as enum members.
    model_config = pydantic.ConfigDict(use_enum_values=True)

    voice_id: str
    voice_name: str | None = None
    languages: list[models_tts_languages.LANGUAGE_CODES_ENUM_TYPE]
    provider: models_tts_base.VoiceModelProvidersEnum = models_tts_base.VoiceModelProvidersEnum.ELEVEN_LABS
class ElevenLabsListVoiceModelsModel(pydantic.BaseModel):
    """Container of ElevenLabs voice models, pre-populated with a built-in catalogue."""

    models: list[ElevenLabsVoiceModel] = [
        ElevenLabsVoiceModel(
            voice_id="eleven_multilingual_v1",
            languages=[
                models_tts_languages.ElevenLabsLanguageCodesEnum.ENGLISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.GERMAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.POLISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.SPANISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.ITALIAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.FRENCH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.PORTUGUESE,
                models_tts_languages.ElevenLabsLanguageCodesEnum.HINDI,
                models_tts_languages.ElevenLabsLanguageCodesEnum.ARABIC,
            ],
        ),
        ElevenLabsVoiceModel(
            voice_id="eleven_multilingual_v2",
            languages=[
                models_tts_languages.ElevenLabsLanguageCodesEnum.ENGLISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.JAPANESE,
                models_tts_languages.ElevenLabsLanguageCodesEnum.CHINESE,
                models_tts_languages.ElevenLabsLanguageCodesEnum.GERMAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.HINDI,
                models_tts_languages.ElevenLabsLanguageCodesEnum.FRENCH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.KOREAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.PORTUGUESE,
                models_tts_languages.ElevenLabsLanguageCodesEnum.ITALIAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.SPANISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.INDONESIAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.DUTCH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.TURKISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.FILIPINO,
                models_tts_languages.ElevenLabsLanguageCodesEnum.POLISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.SWEDISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.BULGARIAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.ROMANIAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.ARABIC,
                models_tts_languages.ElevenLabsLanguageCodesEnum.CZECH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.GREEK,
                models_tts_languages.ElevenLabsLanguageCodesEnum.FINNISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.CROATIAN,
                models_tts_languages.ElevenLabsLanguageCodesEnum.MALAY,
                models_tts_languages.ElevenLabsLanguageCodesEnum.SLOVAK,
                models_tts_languages.ElevenLabsLanguageCodesEnum.DANISH,
                models_tts_languages.ElevenLabsLanguageCodesEnum.TAMIL,
                models_tts_languages.ElevenLabsLanguageCodesEnum.UKRAINIAN,
            ],
        ),
        # English-only model. It previously reused the "eleven_multilingual_v2" id, which
        # made this entry unreachable by name and left two entries with the same id.
        ElevenLabsVoiceModel(
            voice_id="eleven_monolingual_v1",
            languages=[models_tts_languages.ElevenLabsLanguageCodesEnum.ENGLISH],
        ),
    ]

    @classmethod
    def from_api(cls, voice_models_from_api: list[dict[str, typing.Any]]) -> typing.Self:
        """Build the container from raw voice-model dicts returned by the provider API.

        :param voice_models_from_api: one dict per voice model, validated as ``ElevenLabsVoiceModel``
        :return: an instance of ``cls`` (so subclasses get their own type back)
        """
        voice_models = [ElevenLabsVoiceModel.model_validate(voice_model) for voice_model in voice_models_from_api]
        return cls(models=voice_models)

View File

@ -0,0 +1,83 @@
import enum
class BaseLanguageCodesEnum(enum.Enum):
    """Provider-independent language codes (short lowercase codes such as ``"en"``)."""

    RUSSIAN = "ru"
    ENGLISH = "en"
    KAZAKH = "kk"
    GERMAN = "de"
    HEBREW = "he"
    UZBEK = "uz"
    JAPANESE = "ja"
    CHINESE = "zh"
    HINDI = "hi"
    FRENCH = "fr"
    KOREAN = "ko"
    PORTUGUESE = "pt"
    ITALIAN = "it"
    SPANISH = "es"
    INDONESIAN = "id"
    DUTCH = "nl"
    TURKISH = "tr"
    FILIPINO = "fil"
    POLISH = "pl"
    SWEDISH = "sv"
    BULGARIAN = "bg"
    ROMANIAN = "ro"
    ARABIC = "ar"
    CZECH = "cs"
    GREEK = "el"
    FINNISH = "fi"
    CROATIAN = "hr"
    MALAY = "ms"
    SLOVAK = "sk"
    DANISH = "da"
    TAMIL = "ta"
    UKRAINIAN = "uk"
class ElevenLabsLanguageCodesEnum(enum.Enum):
    """Language codes used for ElevenLabs voice models.

    The members and values currently mirror ``BaseLanguageCodesEnum`` exactly.
    """

    RUSSIAN = "ru"
    ENGLISH = "en"
    KAZAKH = "kk"
    GERMAN = "de"
    HEBREW = "he"
    UZBEK = "uz"
    JAPANESE = "ja"
    CHINESE = "zh"
    HINDI = "hi"
    FRENCH = "fr"
    KOREAN = "ko"
    PORTUGUESE = "pt"
    ITALIAN = "it"
    SPANISH = "es"
    INDONESIAN = "id"
    DUTCH = "nl"
    TURKISH = "tr"
    FILIPINO = "fil"
    POLISH = "pl"
    SWEDISH = "sv"
    BULGARIAN = "bg"
    ROMANIAN = "ro"
    ARABIC = "ar"
    CZECH = "cs"
    GREEK = "el"
    FINNISH = "fi"
    CROATIAN = "hr"
    MALAY = "ms"
    SLOVAK = "sk"
    DANISH = "da"
    TAMIL = "ta"
    UKRAINIAN = "uk"
class YandexLanguageCodesEnum(enum.Enum):
    """Language codes used for Yandex voice models (``language-REGION`` style values)."""

    RUSSIAN = "ru-RU"
    ENGLISH = "en-US"
    KAZAKH = "kk-KK"
    GERMAN = "de-DE"
    HEBREW = "he-IL"
    UZBEK = "uz-UZ"
# Union of all language-code enums; member names (e.g. RUSSIAN) are shared across them
# while the values differ per provider.
LANGUAGE_CODES_ENUM_TYPE = BaseLanguageCodesEnum | ElevenLabsLanguageCodesEnum | YandexLanguageCodesEnum

View File

@ -0,0 +1,97 @@
import typing
import pydantic
import lib.models.tts.voice.base as models_tts_base
import lib.models.tts.voice.languages as models_tts_languages
class YandexVoiceModel(models_tts_base.BaseVoiceModel):
    """Voice model whose provider defaults to Yandex, with an optional ``role``."""

    voice_id: str
    # Falls back to "<voice_id> <role>" (or just ``voice_id``) when omitted.
    voice_name: str | None = None
    languages: list[models_tts_languages.LANGUAGE_CODES_ENUM_TYPE]
    provider: models_tts_base.VoiceModelProvidersEnum = models_tts_base.VoiceModelProvidersEnum.YANDEX
    role: str | None = None

    @pydantic.model_validator(mode="before")
    @classmethod
    def check_voice_name_exists(cls, data: typing.Any) -> typing.Any:
        """Derive ``voice_name`` from ``voice_id`` and ``role`` when no name is supplied.

        A "before" validator receives the raw input, which is not guaranteed to
        be a mapping (for example, an already-built model instance), so anything
        that is not a dict is passed through untouched for pydantic to handle.
        """
        if not isinstance(data, dict):
            return data
        voice_id = data.get("voice_id")
        voice_name = data.get("voice_name")
        role = data.get("role")
        if not voice_name and voice_id:
            data["voice_name"] = f"{voice_id} {role}" if role else voice_id
        return data
class YandexListVoiceModelsModel(pydantic.BaseModel):
    """Container of Yandex voice models, pre-populated with a built-in catalogue.

    Each (voice, role) combination is a separate entry.
    """

    models: list[YandexVoiceModel] = [
        YandexVoiceModel(
            voice_id="ermil", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="ermil", role="good", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="alena", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="alena", role="good", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="jane", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="jane", role="good", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="jane", role="evil", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="omazh", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="omazh", role="evil", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="zahar", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="zahar", role="good", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="filipp", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="madirus", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(voice_id="dasha", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]),
        YandexVoiceModel(voice_id="julia", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]),
        YandexVoiceModel(voice_id="lera", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]),
        YandexVoiceModel(
            voice_id="marina", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="alexander", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(
            voice_id="kirill", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
        ),
        YandexVoiceModel(voice_id="anton", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]),
        YandexVoiceModel(voice_id="john", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.ENGLISH]),
        YandexVoiceModel(voice_id="amira", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.KAZAKH]),
        YandexVoiceModel(voice_id="madi", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.KAZAKH]),
        YandexVoiceModel(voice_id="lea", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.GERMAN]),
        YandexVoiceModel(
            voice_id="naomi", role="modern", languages=[models_tts_languages.YandexLanguageCodesEnum.HEBREW]
        ),
        YandexVoiceModel(
            voice_id="naomi", role="classic", languages=[models_tts_languages.YandexLanguageCodesEnum.HEBREW]
        ),
        YandexVoiceModel(voice_id="nigora", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.UZBEK]),
    ]

    @classmethod
    def from_api(cls, voice_models_from_api: list[dict[str, typing.Any]]) -> typing.Self:
        """Build the container from raw voice-model dicts returned by the provider API.

        :param voice_models_from_api: one dict per voice model, validated as ``YandexVoiceModel``
        :return: an instance of ``cls`` (so subclasses get their own type back)
        """
        voice_models = [YandexVoiceModel.model_validate(voice_model) for voice_model in voice_models_from_api]
        return cls(models=voice_models)

View File

@ -0,0 +1,10 @@
from .models import *
from .repositories import *
from .services import *
__all__ = [
"OpenaiSpeechRepository",
"STTProtocol",
"SpeechService",
"SttVoice",
]

View File

@ -0,0 +1,25 @@
import typing
import pydantic
import lib.app.split_settings as app_split_settings
class SttVoice(pydantic.BaseModel):
    """An audio payload for speech-to-text, checked against the voice settings on creation."""

    audio_size: int
    audio_format: str
    audio_name: str = "voice"
    audio_data: bytes
    voice_settings: app_split_settings.VoiceSettings

    @pydantic.model_validator(mode="before")
    @classmethod
    def validate_audio(cls, v: dict[str, typing.Any]) -> dict[str, typing.Any]:
        """Enforce the size and format limits and fill in a default file name.

        Raises:
            ValueError: if the audio is larger than allowed or its format is not accepted.
        """
        limits: app_split_settings.VoiceSettings = v["voice_settings"]

        size = v["audio_size"]
        if size > limits.max_input_size:
            raise ValueError(f"Audio size is too big: {size}")

        fmt = v["audio_format"]
        if fmt not in limits.available_formats:
            raise ValueError(f"Audio format is not supported: {fmt}")

        # A missing or empty name is replaced by one derived from the format.
        if not v.get("audio_name"):
            v["audio_name"] = f"audio.{fmt}"
        return v

View File

@ -0,0 +1,47 @@
import mimetypes
import tempfile
import magic
import openai
import lib.app.settings as app_settings
import lib.stt as stt
class OpenaiSpeechRepository:
    """Speech-to-text repository backed by the OpenAI audio transcription API."""

    def __init__(self, settings: app_settings.Settings):
        """Keep the settings and configure the module-level OpenAI API key."""
        self.settings = settings
        openai.api_key = self.settings.openai.api_key.get_secret_value()

    @staticmethod
    def __get_file_extension_from_bytes(audio: bytes) -> str | None:
        """Detect the MIME type of ``audio`` from its content and map it to an extension.

        :return: the extension without the dot, or ``None`` when no extension is known
        """
        mime: magic.Magic = magic.Magic(mime=True)
        mime_type: str = mime.from_buffer(audio)
        extension: str | None = mimetypes.guess_extension(mime_type)
        if extension:
            extension = extension.replace(".", "")
        return extension

    async def speech_to_text(self, audio: bytes) -> str:
        """Transcribe ``audio`` and return the recognised text.

        Raises:
            ValueError: if the file type cannot be determined, the payload fails
                ``SttVoice`` validation, or the OpenAI API reports an error.
        """
        import asyncio

        file_extension = self.__get_file_extension_from_bytes(audio)
        if not file_extension:
            raise ValueError("File extension is not supported")

        voice: stt.models.SttVoice = stt.models.SttVoice(
            audio_size=len(audio) // 1024,  # audio size in KB
            audio_format=file_extension,
            audio_data=audio,
            voice_settings=self.settings.voice,
        )

        try:
            # The suffix matters: the uploaded file name carries the audio format.
            with tempfile.NamedTemporaryFile(suffix=f".{file_extension}") as temp_file:
                temp_file.write(voice.audio_data)
                temp_file.seek(0)
                # The OpenAI call is blocking; run it in a worker thread so the event loop
                # keeps serving other requests. The await stays inside the ``with`` block so
                # the temporary file exists for the whole upload.
                transcript = await asyncio.to_thread(
                    openai.Audio.transcribe, self.settings.openai.stt_model, temp_file  # type: ignore
                )
        except openai.error.OpenAIError as e:  # type: ignore[reportGeneralTypeIssues]
            # InvalidRequestError is a subclass of OpenAIError, so one handler covers both.
            raise ValueError(f"OpenAI API error: {e}") from e

        return transcript.text  # type: ignore[reportUnknownVariableType]

View File

@ -0,0 +1,14 @@
import typing
class STTProtocol(typing.Protocol):
    """Structural interface of a speech-to-text backend."""

    async def speech_to_text(self, audio: bytes) -> str:
        """Return the text recognised in ``audio``."""
        ...


class SpeechService:
    """Thin service layer that delegates recognition to an ``STTProtocol`` backend."""

    def __init__(self, repository: STTProtocol):
        self.repository = repository

    async def recognize(self, audio: bytes) -> str:
        """Recognise speech in ``audio`` and return the resulting text."""
        recognized_text = await self.repository.speech_to_text(audio)
        return recognized_text

View File

View File

@ -0,0 +1,5 @@
from .protocols import *
__all__ = [
"TTSRepositoryProtocol",
]

View File

@ -0,0 +1,14 @@
import typing
import lib.models as models
class TTSRepositoryProtocol(typing.Protocol):
    """Structural interface every text-to-speech repository must provide."""

    def get_audio_as_bytes(self, request: models.TTSCreateRequestModel) -> models.TTSCreateResponseModel:
        """Synthesise the request and return the audio."""
        ...

    def get_voice_model_by_name(self, voice_model_name: str) -> models.BaseVoiceModel | None:
        """Return the voice model with the given name, or ``None`` when there is none."""
        ...

    def get_voice_models_by_fields(self, fields: models.TTSSearchVoiceRequestModel) -> models.LIST_VOICE_MODELS_TYPE:
        """Return a list container of the voice models matching the search fields."""
        ...

View File

@ -0,0 +1,5 @@
from .base import *
__all__ = [
"TTSBaseRepository",
]

View File

@ -0,0 +1,57 @@
import abc
import lib.models as models
class HttpClient:  # Mocked class todo remove and use real http client from lib.clients.http_client
    # Placeholder type used only for annotations in this module until the real client is wired in.
    ...
class TTSBaseRepository(abc.ABC):
    """Base class for TTS repositories: provides voice-model lookup over ``voice_models``."""

    def __init__(self, client: HttpClient, is_models_from_api: bool = False):
        """
        :param client: HTTP client used by concrete repositories
        :param is_models_from_api: flag stored for subclasses; not interpreted by this base class
        """
        self.http_client = client
        self.is_models_from_api = is_models_from_api

    @property
    @abc.abstractmethod
    def voice_models(self) -> models.LIST_VOICE_MODELS_TYPE:
        """Container with the voice models this repository offers."""
        ...

    @abc.abstractmethod
    def get_audio_as_bytes(self, request: models.TTSCreateRequestModel) -> models.TTSCreateResponseModel:
        """Synthesise the request and return the audio."""
        raise NotImplementedError

    def get_voice_model_by_name(self, voice_model_name: str) -> models.BaseVoiceModel | None:
        """
        Search voice model by name

        :param voice_model_name: String name
        :return: Voice model that match the name, or None when nothing matches
        """
        for voice_model in self.voice_models.models:
            if voice_model.voice_name == voice_model_name:
                return voice_model
        return None

    def get_list_voice_models_by_fields(
        self, fields: models.TTSSearchVoiceRequestModel
    ) -> list[models.AVAILABLE_MODELS_TYPE]:
        """
        Search voice model by fields

        A model matches when it supports every requested language (compared by
        enum member name, so codes of different providers are comparable) and
        every other provided field equals the model's value for that field.
        A search field the voice model does not have never matches.

        :param fields: Any fields from TTSSearchVoiceRequestModel
        :return: All voice models that match the fields
        """
        criteria = fields.model_dump(exclude_none=True)
        requested_languages = criteria.pop("languages", None)
        requested_language_names: set[str] | None = (
            None if requested_languages is None else {item.name for item in requested_languages}
        )
        missing = object()  # sentinel: the voice model has no such field

        voice_models_response = []
        for voice_model in self.voice_models.models:
            if requested_language_names is not None:
                # NOTE(review): models configured with use_enum_values keep plain strings in
                # ``languages``, which have no ``.name`` -- confirm how those should be compared.
                voice_model_language_names: set[str] = {item.name for item in voice_model.languages}
                if not requested_language_names.issubset(voice_model_language_names):
                    continue

            # Dump once per model; remaining criteria are plain values compared directly.
            voice_model_dump = voice_model.model_dump()
            if all(voice_model_dump.get(field, missing) == value for field, value in criteria.items()):
                voice_models_response.append(voice_model)
        return voice_models_response  # type: ignore[reportUnknownVariableType]

    def get_voice_models_by_fields(self, fields: models.TTSSearchVoiceRequestModel) -> models.LIST_VOICE_MODELS_TYPE:
        """
        Search voice models by fields and return them wrapped in a list container

        This is the method name and return shape declared by TTSRepositoryProtocol;
        the container is of the same type as ``voice_models``.

        :param fields: Any fields from TTSSearchVoiceRequestModel
        :return: Container whose ``models`` are all voice models that match the fields
        """
        matched = self.get_list_voice_models_by_fields(fields)
        return type(self.voice_models)(models=matched)  # type: ignore[reportGeneralTypeIssues]

View File

@ -0,0 +1,35 @@
import lib.app.settings as app_settings
import lib.models as models
import lib.tts.models as tts_models
class TTSService:
    """Facade over the per-provider TTS repositories."""

    def __init__(
        self,
        settings: app_settings.Settings,
        repositories: dict[models.VoiceModelProvidersEnum, tts_models.TTSRepositoryProtocol],
    ):
        """
        :param settings: application settings
        :param repositories: repository to use for each provider
        """
        self.repositories = repositories
        self.settings = settings

    def get_audio_as_bytes(self, request: models.TTSCreateRequestModel) -> models.TTSCreateResponseModel:
        """Synthesise the request with the repository of the voice model's provider."""
        provider = request.voice_model.provider
        return self.repositories[provider].get_audio_as_bytes(request)

    def get_voice_model_by_name(self, voice_model_name: str) -> models.BaseVoiceModel | None:
        """Return the first voice model with this name found in any repository, else ``None``."""
        for repository in self.repositories.values():
            if found := repository.get_voice_model_by_name(voice_model_name):
                return found
        return None

    def get_list_voice_models_by_fields(
        self, fields: models.TTSSearchVoiceRequestModel
    ) -> list[models.AVAILABLE_MODELS_TYPE]:
        """Collect the voice models matching ``fields`` from every repository."""
        collected: list[models.AVAILABLE_MODELS_TYPE] = []
        for repository in self.repositories.values():
            matching = repository.get_voice_models_by_fields(fields).models
            if matching:
                collected.extend(matching)
        return collected

1570
src/assistant/poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -27,7 +27,8 @@ fastapi = "0.103.1"
greenlet = "^2.0.2" greenlet = "^2.0.2"
httpx = "^0.25.0" httpx = "^0.25.0"
multidict = "^6.0.4" multidict = "^6.0.4"
orjson = "^3.9.7" openai = "^0.28.1"
orjson = "3.9.7"
psycopg2-binary = "^2.9.9" psycopg2-binary = "^2.9.9"
pydantic = {extras = ["email"], version = "^2.3.0"} pydantic = {extras = ["email"], version = "^2.3.0"}
pydantic-settings = "^2.0.3" pydantic-settings = "^2.0.3"
@ -35,6 +36,7 @@ pytest = "^7.4.2"
pytest-asyncio = "^0.21.1" pytest-asyncio = "^0.21.1"
python = "^3.11" python = "^3.11"
python-jose = "^3.3.0" python-jose = "^3.3.0"
python-magic = "^0.4.27"
sqlalchemy = "^2.0.20" sqlalchemy = "^2.0.20"
uvicorn = "^0.23.2" uvicorn = "^0.23.2"
wrapt = "^1.15.0" wrapt = "^1.15.0"

View File

@ -1,11 +1,12 @@
import pydantic_settings
import lib.app.split_settings.utils as app_split_settings_utils import lib.app.split_settings.utils as app_split_settings_utils
import pydantic_settings
class LoggingSettings(pydantic_settings.BaseSettings): class LoggingSettings(pydantic_settings.BaseSettings):
model_config = pydantic_settings.SettingsConfigDict( model_config = pydantic_settings.SettingsConfigDict(
env_file=app_split_settings_utils.ENV_PATH, env_file_encoding="utf-8", extra="ignore" env_file=app_split_settings_utils.ENV_PATH,
env_file_encoding="utf-8",
extra="ignore",
) )
log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"