mirror of
https://github.com/ijaric/voice_assistant.git
synced 2025-12-18 05:26:18 +00:00
Merge branch 'features/#45_agent' into tasks/#45_agent
This commit is contained in:
@@ -1,4 +1,10 @@
|
||||
from .agent import AgentHandler
|
||||
from .health import basic_router
|
||||
from .voice_responce_handler import VoiceResponseHandler
|
||||
|
||||
__all__ = ["AgentHandler", "basic_router"]
|
||||
|
||||
__all__ = [
|
||||
"AgentHandler",
|
||||
"VoiceResponseHandler",
|
||||
"basic_router",
|
||||
]
|
||||
|
||||
45
src/assistant/lib/api/v1/handlers/voice_responce_handler.py
Normal file
45
src/assistant/lib/api/v1/handlers/voice_responce_handler.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import http
|
||||
import io
|
||||
|
||||
import fastapi
|
||||
|
||||
import lib.stt.services as stt_services
|
||||
|
||||
# import lib.tts.services as tts_service
|
||||
# import lib.models as models
|
||||
|
||||
|
||||
class VoiceResponseHandler:
|
||||
def __init__(
|
||||
self,
|
||||
stt: stt_services.SpeechService,
|
||||
# tts: tts_service.TTSService,
|
||||
):
|
||||
self.stt = stt
|
||||
# self.tts = tts
|
||||
self.router = fastapi.APIRouter()
|
||||
self.router.add_api_route(
|
||||
"/",
|
||||
self.voice_response,
|
||||
methods=["POST"],
|
||||
summary="Ответ голосового помощника",
|
||||
description="Маршрут возвращает потоковый ответ аудио",
|
||||
)
|
||||
|
||||
async def voice_response(
|
||||
self,
|
||||
voice: bytes = fastapi.File(...),
|
||||
) -> fastapi.responses.StreamingResponse:
|
||||
voice_text: str = await self.stt.recognize(voice)
|
||||
if voice_text == "":
|
||||
raise fastapi.HTTPException(status_code=http.HTTPStatus.BAD_REQUEST, detail="Speech recognition failed")
|
||||
# TODO: Добавить обработку текста через клиента openai
|
||||
# TODO: Добавить синтез речи через клиента tts
|
||||
# TODO: Заменить заглушку на реальный ответ
|
||||
# response = await self.tts.get_audio_as_bytes(
|
||||
# models.TTSCreateRequestModel(
|
||||
# text=voice_text,
|
||||
# )
|
||||
# )
|
||||
# return fastapi.responses.StreamingResponse(io.BytesIO(response.audio_content), media_type="audio/ogg")
|
||||
return fastapi.responses.StreamingResponse(io.BytesIO(voice), media_type="audio/ogg")
|
||||
@@ -1,3 +1,5 @@
|
||||
from .base import HealthResponse
|
||||
|
||||
__all__ = ["HealthResponse"]
|
||||
__all__ = [
|
||||
"HealthResponse",
|
||||
]
|
||||
|
||||
@@ -12,7 +12,9 @@ import lib.app.errors as app_errors
|
||||
import lib.app.settings as app_settings
|
||||
import lib.app.split_settings as app_split_settings
|
||||
import lib.clients as clients
|
||||
import lib.models as models
|
||||
import lib.stt as stt
|
||||
import lib.tts as tts
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -60,15 +62,29 @@ class Application:
|
||||
logger.info("Initializing clients")
|
||||
|
||||
http_yandex_tts_client = clients.AsyncHttpClient(
|
||||
base_url="yandex", # todo add yandex api url from settings
|
||||
proxy_settings=settings.proxy,
|
||||
base_url=settings.tts_yandex.base_url,
|
||||
headers=settings.tts_yandex.base_headers,
|
||||
timeout=settings.tts_yandex.timeout_seconds,
|
||||
)
|
||||
http_eleven_labs_tts_client = clients.AsyncHttpClient(
|
||||
base_url=settings.tts_eleven_labs.base_url,
|
||||
headers=settings.tts_eleven_labs.base_headers,
|
||||
timeout=settings.tts_eleven_labs.timeout_seconds,
|
||||
)
|
||||
|
||||
disposable_resources.append(
|
||||
DisposableResource(
|
||||
name="http_client yandex",
|
||||
dispose_callback=http_yandex_tts_client.close(),
|
||||
)
|
||||
)
|
||||
disposable_resources.append(
|
||||
DisposableResource(
|
||||
name="http_client eleven labs",
|
||||
dispose_callback=http_eleven_labs_tts_client.close(),
|
||||
)
|
||||
)
|
||||
|
||||
# Repositories
|
||||
|
||||
@@ -76,6 +92,16 @@ class Application:
|
||||
stt_repository: stt.STTProtocol = stt.OpenaiSpeechRepository(settings=settings)
|
||||
chat_history_repository = agent.ChatHistoryRepository(pg_async_session=postgres_client.get_async_session())
|
||||
|
||||
tts_yandex_repository = tts.TTSYandexRepository(
|
||||
tts_settings=app_split_settings.TTSYandexSettings(),
|
||||
client=http_yandex_tts_client,
|
||||
)
|
||||
tts_eleven_labs_repository = tts.TTSElevenLabsRepository(
|
||||
tts_settings=app_split_settings.TTSElevenLabsSettings(),
|
||||
client=http_eleven_labs_tts_client,
|
||||
is_models_from_api=True,
|
||||
)
|
||||
|
||||
# Caches
|
||||
|
||||
logger.info("Initializing caches")
|
||||
@@ -85,12 +111,25 @@ class Application:
|
||||
logger.info("Initializing services")
|
||||
stt_service: stt.SpeechService = stt.SpeechService(repository=stt_repository) # type: ignore
|
||||
|
||||
tts_service: tts.TTSService = tts.TTSService( # type: ignore
|
||||
repositories={
|
||||
models.VoiceModelProvidersEnum.YANDEX: tts_yandex_repository,
|
||||
models.VoiceModelProvidersEnum.ELEVEN_LABS: tts_eleven_labs_repository,
|
||||
},
|
||||
)
|
||||
|
||||
# Handlers
|
||||
|
||||
logger.info("Initializing handlers")
|
||||
liveness_probe_handler = api_v1_handlers.basic_router
|
||||
agent_handler = api_v1_handlers.AgentHandler(chat_history_repository=chat_history_repository).router
|
||||
|
||||
# TODO: объявить сервисы tts и openai и добавить их в voice_response_handler
|
||||
voice_response_handler = api_v1_handlers.VoiceResponseHandler(
|
||||
stt=stt_service,
|
||||
# tts=tts_service, # TODO
|
||||
).router
|
||||
|
||||
logger.info("Creating application")
|
||||
|
||||
fastapi_app = fastapi.FastAPI(
|
||||
@@ -104,6 +143,7 @@ class Application:
|
||||
# Routes
|
||||
fastapi_app.include_router(liveness_probe_handler, prefix="/api/v1/health", tags=["health"])
|
||||
fastapi_app.include_router(agent_handler, prefix="/api/v1/agent", tags=["testing"])
|
||||
fastapi_app.include_router(voice_response_handler, prefix="/api/v1/voice", tags=["voice"])
|
||||
|
||||
application = Application(
|
||||
settings=settings,
|
||||
|
||||
@@ -1,24 +1,16 @@
|
||||
import pydantic
|
||||
import pydantic_settings
|
||||
|
||||
import lib.app.split_settings as app_split_settings
|
||||
|
||||
|
||||
class Settings(pydantic_settings.BaseSettings):
|
||||
api: app_split_settings.ApiSettings = pydantic.Field(default_factory=lambda: app_split_settings.ApiSettings())
|
||||
app: app_split_settings.AppSettings = pydantic.Field(default_factory=lambda: app_split_settings.AppSettings())
|
||||
postgres: app_split_settings.PostgresSettings = pydantic.Field(
|
||||
default_factory=lambda: app_split_settings.PostgresSettings()
|
||||
)
|
||||
logger: app_split_settings.LoggingSettings = pydantic.Field(
|
||||
default_factory=lambda: app_split_settings.LoggingSettings()
|
||||
)
|
||||
openai: app_split_settings.OpenaiSettings = pydantic.Field(
|
||||
default_factory=lambda: app_split_settings.OpenaiSettings()
|
||||
)
|
||||
project: app_split_settings.ProjectSettings = pydantic.Field(
|
||||
default_factory=lambda: app_split_settings.ProjectSettings()
|
||||
)
|
||||
|
||||
proxy: app_split_settings.ProxySettings = pydantic.Field(default_factory=lambda: app_split_settings.ProxySettings())
|
||||
voice: app_split_settings.VoiceSettings = pydantic.Field(default_factory=lambda: app_split_settings.VoiceSettings())
|
||||
api: app_split_settings.ApiSettings = app_split_settings.ApiSettings()
|
||||
app: app_split_settings.AppSettings = app_split_settings.AppSettings()
|
||||
postgres: app_split_settings.PostgresSettings = app_split_settings.PostgresSettings()
|
||||
logger: app_split_settings.LoggingSettings = app_split_settings.LoggingSettings()
|
||||
openai: app_split_settings.OpenaiSettings = app_split_settings.OpenaiSettings()
|
||||
project: app_split_settings.ProjectSettings = app_split_settings.ProjectSettings()
|
||||
proxy: app_split_settings.ProxySettings = app_split_settings.ProxySettings()
|
||||
voice: app_split_settings.VoiceSettings = app_split_settings.VoiceSettings()
|
||||
tts_yandex: app_split_settings.TTSYandexSettings = app_split_settings.TTSYandexSettings()
|
||||
tts_eleven_labs: app_split_settings.TTSElevenLabsSettings = app_split_settings.TTSElevenLabsSettings()
|
||||
|
||||
@@ -5,6 +5,7 @@ from .openai import *
|
||||
from .postgres import *
|
||||
from .project import *
|
||||
from .proxy import *
|
||||
from .tts import *
|
||||
from .voice import *
|
||||
|
||||
__all__ = [
|
||||
@@ -15,6 +16,8 @@ __all__ = [
|
||||
"PostgresSettings",
|
||||
"ProjectSettings",
|
||||
"ProxySettings",
|
||||
"TTSElevenLabsSettings",
|
||||
"TTSYandexSettings",
|
||||
"VoiceSettings",
|
||||
"get_logging_config",
|
||||
]
|
||||
|
||||
7
src/assistant/lib/app/split_settings/tts/__init__.py
Normal file
7
src/assistant/lib/app/split_settings/tts/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from .eleven_labs import *
|
||||
from .yandex import *
|
||||
|
||||
__all__ = [
|
||||
"TTSElevenLabsSettings",
|
||||
"TTSYandexSettings",
|
||||
]
|
||||
26
src/assistant/lib/app/split_settings/tts/eleven_labs.py
Normal file
26
src/assistant/lib/app/split_settings/tts/eleven_labs.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import pydantic
|
||||
import pydantic_settings
|
||||
|
||||
import lib.app.split_settings.utils as app_split_settings_utils
|
||||
|
||||
|
||||
class TTSElevenLabsSettings(pydantic_settings.BaseSettings):
|
||||
model_config = pydantic_settings.SettingsConfigDict(
|
||||
env_file=app_split_settings_utils.ENV_PATH,
|
||||
env_prefix="TTS_ELEVEN_LABS_",
|
||||
env_file_encoding="utf-8",
|
||||
extra="ignore",
|
||||
)
|
||||
|
||||
api_key: pydantic.SecretStr = pydantic.Field(default=...)
|
||||
default_voice_id: str = "EXAVITQu4vr4xnSDxMaL"
|
||||
base_url: str = "https://api.elevenlabs.io/v1/"
|
||||
timeout_seconds: int = 30
|
||||
|
||||
@property
|
||||
def base_headers(self) -> dict[str, str]:
|
||||
return {
|
||||
"Accept": "audio/mpeg",
|
||||
"Content-Type": "application/json",
|
||||
"xi-api-key": self.api_key.get_secret_value(),
|
||||
}
|
||||
28
src/assistant/lib/app/split_settings/tts/yandex.py
Normal file
28
src/assistant/lib/app/split_settings/tts/yandex.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import typing
|
||||
|
||||
import pydantic
|
||||
import pydantic_settings
|
||||
|
||||
import lib.app.split_settings.utils as app_split_settings_utils
|
||||
|
||||
|
||||
class TTSYandexSettings(pydantic_settings.BaseSettings):
|
||||
model_config = pydantic_settings.SettingsConfigDict(
|
||||
env_file=app_split_settings_utils.ENV_PATH,
|
||||
env_prefix="TTS_YANDEX_",
|
||||
env_file_encoding="utf-8",
|
||||
extra="ignore",
|
||||
)
|
||||
|
||||
audio_format: typing.Literal["oggopus", "mp3", "lpcm"] = "oggopus"
|
||||
sample_rate_hertz: int = 48000
|
||||
api_key: pydantic.SecretStr = pydantic.Field(default=...)
|
||||
base_url: str = "https://tts.api.cloud.yandex.net/speech/v1/"
|
||||
timeout_seconds: int = 30
|
||||
|
||||
@property
|
||||
def base_headers(self) -> dict[str, str]:
|
||||
return {
|
||||
"Authorization": f"Api-Key {self.api_key.get_secret_value()}",
|
||||
"Content-Type": "application/x-www-form-urlencoded",
|
||||
}
|
||||
@@ -8,7 +8,7 @@ import lib.app.split_settings as app_split_settings
|
||||
class AsyncHttpClient(httpx.AsyncClient):
|
||||
def __init__(
|
||||
self,
|
||||
proxy_settings: app_split_settings.ProxySettings,
|
||||
proxy_settings: app_split_settings.ProxySettings | None = None,
|
||||
base_url: str | None = None,
|
||||
**client_params: typing.Any,
|
||||
) -> None:
|
||||
@@ -17,10 +17,10 @@ class AsyncHttpClient(httpx.AsyncClient):
|
||||
self.proxies = self.__get_proxies_from_settings()
|
||||
self.client_params = client_params
|
||||
|
||||
super().__init__(base_url=self.base_url, proxies=self.proxies, **client_params)
|
||||
super().__init__(base_url=self.base_url, proxies=self.proxies, **client_params) # type: ignore[reportGeneralTypeIssues]
|
||||
|
||||
def __get_proxies_from_settings(self) -> dict[str, str] | None:
|
||||
if not self.proxy_settings.enable:
|
||||
if not self.proxy_settings or not self.proxy_settings.enable:
|
||||
return None
|
||||
proxies = {"all://": self.proxy_settings.dsn}
|
||||
return proxies
|
||||
|
||||
@@ -2,5 +2,27 @@ from .chat_history import Message, RequestChatHistory, RequestChatMessage, Reque
|
||||
from .embedding import Embedding
|
||||
from .movies import Movie
|
||||
from .token import Token
|
||||
from .tts import *
|
||||
|
||||
|
||||
__all__ = ["Embedding", "Message", "Movie", "RequestChatHistory", "RequestChatMessage", "RequestLastSessionId", "Token"]
|
||||
__all__ = [
|
||||
"AVAILABLE_MODELS_TYPE",
|
||||
"Base",
|
||||
"BaseLanguageCodesEnum",
|
||||
"BaseVoiceModel",
|
||||
"ElevenLabsLanguageCodesEnum",
|
||||
"ElevenLabsListVoiceModelsModel",
|
||||
"ElevenLabsVoiceModel",
|
||||
"IdCreatedUpdatedBaseMixin",
|
||||
"LANGUAGE_CODES_ENUM_TYPE",
|
||||
"LIST_VOICE_MODELS_TYPE",
|
||||
"TTSCreateRequestModel",
|
||||
"TTSCreateResponseModel",
|
||||
"TTSSearchVoiceRequestModel",
|
||||
"Token",
|
||||
"VoiceModelProvidersEnum",
|
||||
"YandexLanguageCodesEnum",
|
||||
"YandexListVoiceModelsModel",
|
||||
"YandexVoiceModel",
|
||||
]
|
||||
|
||||
20
src/assistant/lib/models/tts/__init__.py
Normal file
20
src/assistant/lib/models/tts/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from .models import *
|
||||
from .voice import *
|
||||
|
||||
__all__ = [
|
||||
"AVAILABLE_MODELS_TYPE",
|
||||
"BaseLanguageCodesEnum",
|
||||
"BaseVoiceModel",
|
||||
"ElevenLabsLanguageCodesEnum",
|
||||
"ElevenLabsListVoiceModelsModel",
|
||||
"ElevenLabsVoiceModel",
|
||||
"LANGUAGE_CODES_ENUM_TYPE",
|
||||
"LIST_VOICE_MODELS_TYPE",
|
||||
"TTSCreateRequestModel",
|
||||
"TTSCreateResponseModel",
|
||||
"TTSSearchVoiceRequestModel",
|
||||
"VoiceModelProvidersEnum",
|
||||
"YandexLanguageCodesEnum",
|
||||
"YandexListVoiceModelsModel",
|
||||
"YandexVoiceModel",
|
||||
]
|
||||
64
src/assistant/lib/models/tts/models.py
Normal file
64
src/assistant/lib/models/tts/models.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import pydantic
|
||||
|
||||
import lib.models.tts.voice as models_tts_voice
|
||||
import lib.models.tts.voice.languages as models_tts_languages
|
||||
|
||||
AVAILABLE_MODELS_TYPE = models_tts_voice.YandexVoiceModel | models_tts_voice.ElevenLabsVoiceModel
|
||||
LIST_VOICE_MODELS_TYPE = models_tts_voice.YandexListVoiceModelsModel | models_tts_voice.ElevenLabsListVoiceModelsModel
|
||||
DEFAULT_MODEL = models_tts_voice.ElevenLabsVoiceModel(
|
||||
voice_id="eleven_multilingual_v2",
|
||||
languages=[
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ENGLISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.JAPANESE,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.CHINESE,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.GERMAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.HINDI,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.FRENCH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.KOREAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.PORTUGUESE,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ITALIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.SPANISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.INDONESIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.DUTCH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.TURKISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.FILIPINO,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.POLISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.SWEDISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.BULGARIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ROMANIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ARABIC,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.CZECH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.GREEK,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.FINNISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.CROATIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.MALAY,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.SLOVAK,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.DANISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.TAMIL,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.UKRAINIAN,
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
class TTSCreateRequestModel(pydantic.BaseModel):
|
||||
model_config = pydantic.ConfigDict(use_enum_values=True)
|
||||
|
||||
voice_model: AVAILABLE_MODELS_TYPE = DEFAULT_MODEL
|
||||
text: str
|
||||
|
||||
|
||||
class TTSCreateResponseModel(pydantic.BaseModel):
|
||||
audio_content: bytes
|
||||
|
||||
|
||||
class TTSSearchVoiceRequestModel(pydantic.BaseModel):
|
||||
voice_id: str | None = None
|
||||
voice_name: str | None = None
|
||||
languages: list[models_tts_languages.LANGUAGE_CODES_ENUM_TYPE] | None = None
|
||||
company_name: str | None = None
|
||||
|
||||
@pydantic.model_validator(mode="after")
|
||||
def check_at_least_one_field(self):
|
||||
if not any((self.voice_name, self.languages, self.company_name)):
|
||||
raise ValueError("At least one field required")
|
||||
return self
|
||||
17
src/assistant/lib/models/tts/voice/__init__.py
Normal file
17
src/assistant/lib/models/tts/voice/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from .base import *
|
||||
from .eleven_labs import *
|
||||
from .languages import *
|
||||
from .yandex import *
|
||||
|
||||
__all__ = [
|
||||
"BaseLanguageCodesEnum",
|
||||
"BaseVoiceModel",
|
||||
"ElevenLabsLanguageCodesEnum",
|
||||
"ElevenLabsListVoiceModelsModel",
|
||||
"ElevenLabsVoiceModel",
|
||||
"LANGUAGE_CODES_ENUM_TYPE",
|
||||
"VoiceModelProvidersEnum",
|
||||
"YandexLanguageCodesEnum",
|
||||
"YandexListVoiceModelsModel",
|
||||
"YandexVoiceModel",
|
||||
]
|
||||
29
src/assistant/lib/models/tts/voice/base.py
Normal file
29
src/assistant/lib/models/tts/voice/base.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import enum
|
||||
import typing
|
||||
|
||||
import pydantic
|
||||
|
||||
import lib.models.tts.voice.languages as models_tts_languages
|
||||
|
||||
|
||||
class VoiceModelProvidersEnum(enum.Enum):
|
||||
YANDEX = "yandex"
|
||||
ELEVEN_LABS = "eleven_labs"
|
||||
|
||||
|
||||
class BaseVoiceModel(pydantic.BaseModel):
|
||||
voice_id: str
|
||||
voice_name: str | None = None
|
||||
languages: list[models_tts_languages.LANGUAGE_CODES_ENUM_TYPE]
|
||||
provider: VoiceModelProvidersEnum
|
||||
|
||||
@pydantic.model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_voice_name_exists(cls, data: typing.Any) -> typing.Any:
|
||||
if not data:
|
||||
return data
|
||||
voice_id = data.get("voice_id")
|
||||
voice_name = data.get("voice_name")
|
||||
if not voice_name and voice_id:
|
||||
data["voice_name"] = voice_id
|
||||
return data
|
||||
83
src/assistant/lib/models/tts/voice/eleven_labs.py
Normal file
83
src/assistant/lib/models/tts/voice/eleven_labs.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import typing
|
||||
|
||||
import pydantic
|
||||
|
||||
import lib.models.tts.voice.base as models_tts_base
|
||||
import lib.models.tts.voice.languages as models_tts_languages
|
||||
|
||||
|
||||
class ElevenLabsVoiceModel(models_tts_base.BaseVoiceModel):
|
||||
model_config = pydantic.ConfigDict(use_enum_values=True)
|
||||
voice_id: str
|
||||
voice_name: str | None = None
|
||||
languages: list[models_tts_languages.LANGUAGE_CODES_ENUM_TYPE]
|
||||
provider: models_tts_base.VoiceModelProvidersEnum = models_tts_base.VoiceModelProvidersEnum.ELEVEN_LABS
|
||||
|
||||
|
||||
class ElevenLabsListVoiceModelsModel(pydantic.BaseModel):
|
||||
models: list[ElevenLabsVoiceModel] = [
|
||||
ElevenLabsVoiceModel(
|
||||
voice_id="eleven_multilingual_v1",
|
||||
languages=[
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ENGLISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.GERMAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.POLISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.SPANISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ITALIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.FRENCH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.PORTUGUESE,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.HINDI,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ARABIC,
|
||||
],
|
||||
),
|
||||
ElevenLabsVoiceModel(
|
||||
voice_id="eleven_multilingual_v2",
|
||||
languages=[
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ENGLISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.JAPANESE,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.CHINESE,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.GERMAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.HINDI,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.FRENCH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.KOREAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.PORTUGUESE,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ITALIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.SPANISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.INDONESIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.DUTCH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.TURKISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.FILIPINO,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.POLISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.SWEDISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.BULGARIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ROMANIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.ARABIC,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.CZECH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.GREEK,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.FINNISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.CROATIAN,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.MALAY,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.SLOVAK,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.DANISH,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.TAMIL,
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum.UKRAINIAN,
|
||||
],
|
||||
),
|
||||
ElevenLabsVoiceModel(
|
||||
voice_id="eleven_multilingual_v2",
|
||||
languages=[models_tts_languages.ElevenLabsLanguageCodesEnum.ENGLISH],
|
||||
),
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def from_api(cls, voice_models_from_api: list[dict[str, typing.Any]]) -> typing.Self:
|
||||
voice_models = []
|
||||
for voice_model in voice_models_from_api:
|
||||
voice_model["voice_id"] = voice_model.pop("model_id")
|
||||
voice_model["voice_name"] = voice_model.pop("name")
|
||||
voice_model["languages"] = [
|
||||
models_tts_languages.ElevenLabsLanguageCodesEnum(item.get("language_id"))
|
||||
for item in voice_model.pop("languages")
|
||||
]
|
||||
voice_models.append(ElevenLabsVoiceModel.model_validate(voice_model))
|
||||
return ElevenLabsListVoiceModelsModel(models=voice_models)
|
||||
83
src/assistant/lib/models/tts/voice/languages.py
Normal file
83
src/assistant/lib/models/tts/voice/languages.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import enum
|
||||
|
||||
|
||||
class BaseLanguageCodesEnum(enum.Enum):
|
||||
RUSSIAN = "ru"
|
||||
ENGLISH = "en"
|
||||
KAZAKH = "kk"
|
||||
GERMAN = "de"
|
||||
HEBREW = "he"
|
||||
UZBEK = "uz"
|
||||
JAPANESE = "ja"
|
||||
CHINESE = "zh"
|
||||
HINDI = "hi"
|
||||
FRENCH = "fr"
|
||||
KOREAN = "ko"
|
||||
PORTUGUESE = "pt"
|
||||
ITALIAN = "it"
|
||||
SPANISH = "es"
|
||||
INDONESIAN = "id"
|
||||
DUTCH = "nl"
|
||||
TURKISH = "tr"
|
||||
FILIPINO = "fil"
|
||||
POLISH = "pl"
|
||||
SWEDISH = "sv"
|
||||
BULGARIAN = "bg"
|
||||
ROMANIAN = "ro"
|
||||
ARABIC = "ar"
|
||||
CZECH = "cs"
|
||||
GREEK = "el"
|
||||
FINNISH = "fi"
|
||||
CROATIAN = "hr"
|
||||
MALAY = "ms"
|
||||
SLOVAK = "sk"
|
||||
DANISH = "da"
|
||||
TAMIL = "ta"
|
||||
UKRAINIAN = "uk"
|
||||
|
||||
|
||||
class ElevenLabsLanguageCodesEnum(enum.Enum):
|
||||
RUSSIAN = "ru"
|
||||
ENGLISH = "en"
|
||||
KAZAKH = "kk"
|
||||
GERMAN = "de"
|
||||
HEBREW = "he"
|
||||
UZBEK = "uz"
|
||||
JAPANESE = "ja"
|
||||
CHINESE = "zh"
|
||||
HINDI = "hi"
|
||||
FRENCH = "fr"
|
||||
KOREAN = "ko"
|
||||
PORTUGUESE = "pt"
|
||||
ITALIAN = "it"
|
||||
SPANISH = "es"
|
||||
INDONESIAN = "id"
|
||||
DUTCH = "nl"
|
||||
TURKISH = "tr"
|
||||
FILIPINO = "fil"
|
||||
POLISH = "pl"
|
||||
SWEDISH = "sv"
|
||||
BULGARIAN = "bg"
|
||||
ROMANIAN = "ro"
|
||||
ARABIC = "ar"
|
||||
CZECH = "cs"
|
||||
GREEK = "el"
|
||||
FINNISH = "fi"
|
||||
CROATIAN = "hr"
|
||||
MALAY = "ms"
|
||||
SLOVAK = "sk"
|
||||
DANISH = "da"
|
||||
TAMIL = "ta"
|
||||
UKRAINIAN = "uk"
|
||||
|
||||
|
||||
class YandexLanguageCodesEnum(enum.Enum):
|
||||
RUSSIAN = "ru-RU"
|
||||
ENGLISH = "en-US"
|
||||
KAZAKH = "kk-KK"
|
||||
GERMAN = "de-DE"
|
||||
HEBREW = "he-IL"
|
||||
UZBEK = "uz-UZ"
|
||||
|
||||
|
||||
LANGUAGE_CODES_ENUM_TYPE = BaseLanguageCodesEnum | ElevenLabsLanguageCodesEnum | YandexLanguageCodesEnum
|
||||
99
src/assistant/lib/models/tts/voice/yandex.py
Normal file
99
src/assistant/lib/models/tts/voice/yandex.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import typing
|
||||
|
||||
import pydantic
|
||||
|
||||
import lib.models.tts.voice.base as models_tts_base
|
||||
import lib.models.tts.voice.languages as models_tts_languages
|
||||
|
||||
|
||||
class YandexVoiceModel(models_tts_base.BaseVoiceModel):
|
||||
voice_id: str
|
||||
voice_name: str | None = None
|
||||
languages: list[models_tts_languages.LANGUAGE_CODES_ENUM_TYPE]
|
||||
provider: models_tts_base.VoiceModelProvidersEnum = models_tts_base.VoiceModelProvidersEnum.YANDEX
|
||||
role: str | None = None
|
||||
|
||||
@pydantic.model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_voice_name_exists(cls, data: typing.Any) -> typing.Any:
|
||||
if not data:
|
||||
return data
|
||||
voice_id = data.get("voice_id")
|
||||
voice_name = data.get("voice_name")
|
||||
role = data.get("role")
|
||||
if not voice_name and voice_id:
|
||||
data["voice_name"] = f"{voice_id} {role}" if role else voice_id
|
||||
return data
|
||||
|
||||
|
||||
class YandexListVoiceModelsModel(pydantic.BaseModel):
|
||||
models: list[YandexVoiceModel] = [
|
||||
YandexVoiceModel(
|
||||
voice_id="ermil", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="ermil", role="good", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="alena", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="alena", role="good", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="jane", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="jane", role="good", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="jane", role="evil", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="omazh", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="omazh", role="evil", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="zahar", role="neutral", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="zahar", role="good", languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="filipp", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="madirus", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(voice_id="dasha", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]),
|
||||
YandexVoiceModel(voice_id="julia", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]),
|
||||
YandexVoiceModel(voice_id="lera", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]),
|
||||
YandexVoiceModel(
|
||||
voice_id="marina", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="alexander", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="kirill", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]
|
||||
),
|
||||
YandexVoiceModel(voice_id="anton", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.RUSSIAN]),
|
||||
YandexVoiceModel(voice_id="john", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.ENGLISH]),
|
||||
YandexVoiceModel(voice_id="amira", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.KAZAKH]),
|
||||
YandexVoiceModel(voice_id="madi", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.KAZAKH]),
|
||||
YandexVoiceModel(voice_id="lea", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.GERMAN]),
|
||||
YandexVoiceModel(
|
||||
voice_id="naomi", role="modern", languages=[models_tts_languages.YandexLanguageCodesEnum.HEBREW]
|
||||
),
|
||||
YandexVoiceModel(
|
||||
voice_id="naomi", role="classic", languages=[models_tts_languages.YandexLanguageCodesEnum.HEBREW]
|
||||
),
|
||||
YandexVoiceModel(voice_id="nigora", role=None, languages=[models_tts_languages.YandexLanguageCodesEnum.UZBEK]),
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def from_api(cls, voice_models_from_api: list[dict[str, typing.Any]]) -> typing.Self:
|
||||
voice_models = [YandexVoiceModel.model_validate(voice_model) for voice_model in voice_models_from_api]
|
||||
return YandexListVoiceModelsModel(models=voice_models)
|
||||
@@ -1,8 +1,11 @@
|
||||
import http
|
||||
import mimetypes
|
||||
import tempfile
|
||||
|
||||
import fastapi
|
||||
import magic
|
||||
import openai
|
||||
import pydantic
|
||||
|
||||
import lib.app.settings as app_settings
|
||||
import lib.stt as stt
|
||||
@@ -24,15 +27,24 @@ class OpenaiSpeechRepository:
|
||||
|
||||
async def speech_to_text(self, audio: bytes) -> str:
|
||||
file_extension = self.__get_file_extension_from_bytes(audio)
|
||||
if not file_extension:
|
||||
raise ValueError("File extension is not supported")
|
||||
|
||||
voice: stt.models.SttVoice = stt.models.SttVoice(
|
||||
audio_size=len(audio) // 1024, # audio size in MB,
|
||||
audio_format=file_extension,
|
||||
audio_data=audio,
|
||||
voice_settings=self.settings.voice,
|
||||
)
|
||||
if not file_extension or file_extension not in self.settings.voice.available_formats:
|
||||
raise fastapi.HTTPException(
|
||||
status_code=http.HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
|
||||
detail=f"File extension is not supported. "
|
||||
f"Available extensions: {self.settings.voice.available_formats}",
|
||||
)
|
||||
try:
|
||||
voice: stt.models.SttVoice = stt.models.SttVoice(
|
||||
audio_size=len(audio) // 1024, # audio size in MB,
|
||||
audio_format=file_extension,
|
||||
audio_data=audio,
|
||||
voice_settings=self.settings.voice,
|
||||
)
|
||||
except (pydantic.ValidationError, ValueError) as e:
|
||||
raise fastapi.HTTPException(
|
||||
status_code=http.HTTPStatus.BAD_REQUEST,
|
||||
detail=f"Voice validation error: {e}",
|
||||
)
|
||||
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(suffix=f".{file_extension}") as temp_file:
|
||||
@@ -40,8 +52,14 @@ class OpenaiSpeechRepository:
|
||||
temp_file.seek(0)
|
||||
transcript = openai.Audio.transcribe(self.settings.openai.stt_model, temp_file) # type: ignore
|
||||
except openai.error.InvalidRequestError as e: # type: ignore[reportGeneralTypeIssues]
|
||||
raise ValueError(f"OpenAI API error: {e}")
|
||||
raise fastapi.HTTPException(
|
||||
status_code=http.HTTPStatus.BAD_REQUEST,
|
||||
detail=f"OpenAI request error: {e}",
|
||||
)
|
||||
except openai.error.OpenAIError as e: # type: ignore[reportGeneralTypeIssues]
|
||||
raise ValueError(f"OpenAI API error: {e}")
|
||||
raise fastapi.HTTPException(
|
||||
status_code=http.HTTPStatus.BAD_REQUEST,
|
||||
detail=f"OpenAI API error: {e}",
|
||||
)
|
||||
|
||||
return transcript.text # type: ignore[reportUnknownVariableType]
|
||||
|
||||
9
src/assistant/lib/tts/__init__.py
Normal file
9
src/assistant/lib/tts/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from .repositories import *
|
||||
from .services import *
|
||||
|
||||
__all__ = [
|
||||
"TTSBaseRepository",
|
||||
"TTSElevenLabsRepository",
|
||||
"TTSService",
|
||||
"TTSYandexRepository",
|
||||
]
|
||||
5
src/assistant/lib/tts/models/__init__.py
Normal file
5
src/assistant/lib/tts/models/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .protocols import *
|
||||
|
||||
__all__ = [
|
||||
"TTSRepositoryProtocol",
|
||||
]
|
||||
16
src/assistant/lib/tts/models/protocols.py
Normal file
16
src/assistant/lib/tts/models/protocols.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import typing
|
||||
|
||||
import lib.models as models
|
||||
|
||||
|
||||
class TTSRepositoryProtocol(typing.Protocol):
    """Structural interface every TTS provider repository must satisfy."""

    async def get_audio_as_bytes(self, request: models.TTSCreateRequestModel) -> models.TTSCreateResponseModel:
        """Synthesize ``request.text`` and return the audio payload."""

    async def get_voice_model_by_name(self, voice_model_name: str) -> models.BaseVoiceModel | None:
        """Look up a single voice model by its name, or return ``None``."""

    async def get_voice_models_by_fields(
        self, fields: models.TTSSearchVoiceRequestModel
    ) -> models.LIST_VOICE_MODELS_TYPE:
        """Return every voice model matching the given search fields."""
|
||||
9
src/assistant/lib/tts/repositories/__init__.py
Normal file
9
src/assistant/lib/tts/repositories/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from .base import *
|
||||
from .eleven_labs import *
|
||||
from .yandex import *
|
||||
|
||||
__all__ = [
|
||||
"TTSBaseRepository",
|
||||
"TTSElevenLabsRepository",
|
||||
"TTSYandexRepository",
|
||||
]
|
||||
56
src/assistant/lib/tts/repositories/base.py
Normal file
56
src/assistant/lib/tts/repositories/base.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import abc
|
||||
|
||||
import lib.clients as clients
|
||||
import lib.models as models
|
||||
|
||||
|
||||
class TTSBaseRepository(abc.ABC):
    """Shared behavior for TTS provider repositories.

    Concrete subclasses supply the voice-model catalog (``voice_models``)
    and the actual synthesis call (``get_audio_as_bytes``); the search
    helpers defined here work on top of those.
    """

    def __init__(self, client: clients.AsyncHttpClient, is_models_from_api: bool = False):
        self.http_client = client
        self.is_models_from_api = is_models_from_api

    @property
    @abc.abstractmethod
    async def voice_models(self) -> models.LIST_VOICE_MODELS_TYPE:
        # Awaitable property: callers must ``await self.voice_models``.
        raise NotImplementedError

    @abc.abstractmethod
    async def get_audio_as_bytes(self, request: models.TTSCreateRequestModel) -> models.TTSCreateResponseModel:
        raise NotImplementedError

    async def get_voice_model_by_name(self, voice_model_name: str) -> models.BaseVoiceModel | None:
        """Search voice model by name.

        :param voice_model_name: String name
        :return: Voice model that matches the name, or ``None`` if absent
        """
        catalog = await self.voice_models
        candidates = (model for model in catalog.models if model.voice_name == voice_model_name)
        return next(candidates, None)

    async def get_list_voice_models_by_fields(
        self, fields: models.TTSSearchVoiceRequestModel
    ) -> list[models.AVAILABLE_MODELS_TYPE]:
        """Search voice models by fields.

        :param fields: Any fields from TTSSearchVoiceRequestModel
        :return: All voice models that match every provided field
        """
        criteria = fields.model_dump(exclude_none=True)

        def _matches(candidate) -> bool:
            # A candidate passes only if every requested field agrees.
            candidate_dump = candidate.model_dump()
            for field_name, wanted in criteria.items():
                if field_name == "languages":
                    # "languages" is a list: every requested language must be
                    # present in the candidate's language set.
                    wanted_names: set[str] = {item.name for item in wanted}
                    candidate_names: set[str] = {item.name for item in candidate.languages}
                    if not wanted_names.issubset(candidate_names):
                        return False
                elif candidate_dump[field_name] != wanted.name:
                    return False
            return True

        catalog = await self.voice_models
        return [model for model in catalog.models if _matches(model)]  # type: ignore[reportUnknownVariableType]
|
||||
42
src/assistant/lib/tts/repositories/eleven_labs.py
Normal file
42
src/assistant/lib/tts/repositories/eleven_labs.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import typing
|
||||
|
||||
import lib.app.split_settings as app_split_settings
|
||||
import lib.clients as clients
|
||||
import lib.models as models
|
||||
import lib.tts.repositories.base as tts_repositories_base
|
||||
|
||||
|
||||
class TTSElevenLabsRepository(tts_repositories_base.TTSBaseRepository):
    """ElevenLabs-backed TTS repository."""

    def __init__(
        self,
        tts_settings: app_split_settings.TTSElevenLabsSettings,
        client: clients.AsyncHttpClient,
        is_models_from_api: bool = False,
    ):
        self.tts_settings = tts_settings
        super().__init__(client, is_models_from_api)

    @property
    async def voice_models(self) -> models.ElevenLabsListVoiceModelsModel:
        # Static catalog by default; hit the API only when asked to.
        if not self.is_models_from_api:
            return models.ElevenLabsListVoiceModelsModel()
        raw_models = await self.get_all_models_dict_from_api()
        return models.ElevenLabsListVoiceModelsModel.from_api(raw_models)

    async def get_all_models_dict_from_api(self) -> list[dict[str, typing.Any]]:
        """Fetch raw model descriptions from the ElevenLabs ``/models`` endpoint."""
        api_response = await self.http_client.get("/models")
        return api_response.json()

    async def get_audio_as_bytes(self, request: models.TTSCreateRequestModel) -> models.TTSCreateResponseModel:
        """Synthesize speech for ``request.text`` via ElevenLabs.

        :raises ValueError: if the request carries a non-ElevenLabs voice model.
        """
        if not isinstance(request.voice_model, models.ElevenLabsVoiceModel):
            raise ValueError("ElevenLabs TTS support only ElevenLabsVoiceModel")
        # NOTE(review): the URL uses the settings' default_voice_id while the
        # request's voice_id is sent as "model_id" — looks intentional but
        # worth confirming against the ElevenLabs API contract.
        payload = {"text": request.text, "model_id": request.voice_model.voice_id}
        api_response = await self.http_client.post(
            f"/text-to-speech/{self.tts_settings.default_voice_id}",
            json=payload,
        )
        return models.TTSCreateResponseModel(audio_content=api_response.content)

    async def get_voice_models_by_fields(
        self, fields: models.TTSSearchVoiceRequestModel
    ) -> models.ElevenLabsListVoiceModelsModel:
        matching = await self.get_list_voice_models_by_fields(fields)
        return models.ElevenLabsListVoiceModelsModel(models=matching)  # type: ignore
|
||||
48
src/assistant/lib/tts/repositories/yandex.py
Normal file
48
src/assistant/lib/tts/repositories/yandex.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import logging
|
||||
|
||||
import lib.app.split_settings as app_split_settings
|
||||
import lib.clients as clients
|
||||
import lib.models as models
|
||||
import lib.tts.repositories.base as tts_repositories_base
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TTSYandexRepository(tts_repositories_base.TTSBaseRepository):
    """Yandex SpeechKit-backed TTS repository."""

    def __init__(
        self,
        tts_settings: app_split_settings.TTSYandexSettings,
        client: clients.AsyncHttpClient,
        is_models_from_api: bool = False,
    ):
        self.tts_settings = tts_settings
        # Yandex exposes no voice-catalog endpoint, so the API flag is
        # ignored (with a warning) and the static model list is always used.
        if is_models_from_api:
            logger.warning("Yandex TTS doesn't support getting models from API")
        super().__init__(client, is_models_from_api=False)

    @property
    async def voice_models(self) -> models.YandexListVoiceModelsModel:
        return models.YandexListVoiceModelsModel()

    async def get_audio_as_bytes(self, request: models.TTSCreateRequestModel) -> models.TTSCreateResponseModel:
        """Synthesize speech for ``request.text`` via Yandex SpeechKit.

        :raises ValueError: if the request carries a non-Yandex voice model.
        """
        voice_model = request.voice_model
        if not isinstance(voice_model, models.YandexVoiceModel):
            raise ValueError("Yandex TTS support only YandexVoiceModel")
        request_body = {
            "text": request.text,
            # Only the model's first language is sent to the API.
            "lang": voice_model.languages[0].value,
            "voice": voice_model.voice_id,
            "emotion": voice_model.role,
            "format": self.tts_settings.audio_format,
            "sampleRateHertz": self.tts_settings.sample_rate_hertz,
        }
        api_response = await self.http_client.post(
            "/tts:synthesize",
            data=request_body,
        )
        return models.TTSCreateResponseModel(audio_content=api_response.content)

    async def get_voice_models_by_fields(
        self, fields: models.TTSSearchVoiceRequestModel
    ) -> models.YandexListVoiceModelsModel:
        matching = await self.get_list_voice_models_by_fields(fields)
        return models.YandexListVoiceModelsModel(models=matching)  # type: ignore
|
||||
33
src/assistant/lib/tts/services.py
Normal file
33
src/assistant/lib/tts/services.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import lib.models as _models
|
||||
import lib.tts.models as tts_models
|
||||
|
||||
|
||||
class TTSService:
    """Facade that routes TTS requests to the provider-specific repository."""

    def __init__(
        self,
        repositories: dict[_models.VoiceModelProvidersEnum, tts_models.TTSRepositoryProtocol],
    ):
        self.repositories = repositories

    async def get_audio_as_bytes(self, request: _models.TTSCreateRequestModel) -> _models.TTSCreateResponseModel:
        """Dispatch synthesis to the repository of the request's provider."""
        provider = request.voice_model.provider
        repository = self.repositories[provider]
        return await repository.get_audio_as_bytes(request)

    async def get_voice_model_by_name(self, voice_model_name: str) -> _models.BaseVoiceModel | None:
        """Return the first repository's match for the name.

        :raises ValueError: when no repository knows the voice model.
        """
        for repository in self.repositories.values():
            found = await repository.get_voice_model_by_name(voice_model_name)
            if found:
                return found
        raise ValueError("Voice model not found")

    async def get_list_voice_models_by_fields(
        self, fields: _models.TTSSearchVoiceRequestModel
    ) -> list[_models.AVAILABLE_MODELS_TYPE]:
        """Collect matching voice models across every registered repository."""
        collected: list[_models.AVAILABLE_MODELS_TYPE] = []
        for repository in self.repositories.values():
            provider_matches = await repository.get_voice_models_by_fields(fields)
            if provider_matches.models:
                collected.extend(provider_matches.models)
        return collected
|
||||
Reference in New Issue
Block a user