etl module added

This commit is contained in:
garaev kamil 2025-12-05 22:59:33 +03:00
parent 0f619dd954
commit ff36173720
16 changed files with 1573 additions and 0 deletions

View file

@ -0,0 +1,235 @@
# anime_etl/db/repository.py
from __future__ import annotations
import json
from typing import Optional, Dict, List
import psycopg
from psycopg.rows import dict_row
from models import CanonicalTitle, Studio, Image
from images.downloader import ensure_image_downloaded
Conn = psycopg.AsyncConnection
def _choose_primary_name(
title_names: Dict[str, List[str]],
) -> Optional[tuple[str, str]]:
# (lang, name)
for lang in ("en", "romaji", "ja"):
variants = title_names.get(lang) or []
if variants:
return lang, variants[0]
for lang, variants in title_names.items():
if variants:
return lang, variants[0]
return None
async def get_or_create_image(
    conn: Conn,
    img: Optional[Image],
    *,
    subdir: str = "posters",
) -> Optional[int]:
    """Return the ``images.id`` for *img*, inserting a row if none exists.

    The value of ``img.image_path`` is handed to ``ensure_image_downloaded``
    together with *subdir*; whatever that helper returns is used both as the
    lookup key and as the stored ``image_path``.

    Args:
        conn: Async psycopg connection used for the lookup and the insert.
        img: Image descriptor; ``None`` or an empty ``image_path`` means
            there is nothing to store.
        subdir: Forwarded to ``ensure_image_downloaded`` as ``subdir``.

    Returns:
        The id of the existing or newly inserted ``images`` row, or ``None``
        when *img* carries no path.
    """
    # Per the original author's note, image_path currently holds a URL
    # coming from AniList.
    source_url = img.image_path if img is not None else None
    if not source_url:
        return None

    # Step 1: decide where the picture lives and download it if needed
    # (delegated entirely to the helper).
    local_path = await ensure_image_downloaded(source_url, subdir=subdir)

    select_sql = "SELECT id FROM images WHERE image_path = %s"
    insert_sql = (
        "\n"
        "INSERT INTO images (storage_type, image_path)\n"
        "VALUES (%s, %s)\n"
        "RETURNING id\n"
    )

    async with conn.cursor(row_factory=dict_row) as cur:
        # Step 2: reuse an existing record keyed by the relative path.
        await cur.execute(select_sql, (local_path,))
        found = await cur.fetchone()
        if not found:
            # Step 3: no record yet, so create one.
            await cur.execute(insert_sql, ("local", local_path))
            found = await cur.fetchone()
        return found["id"]
async def get_or_create_studio(
    conn: Conn,
    studio: Optional[Studio],
) -> Optional[int]:
    """Return the ``studios.id`` for *studio*, creating or enriching the row.

    The studio is looked up by ``studio_name``. If it is missing, a new row
    is inserted together with its poster image (when one is supplied). If it
    already exists, only columns that are currently NULL are back-filled:
    ``illust_id`` from ``studio.poster`` and ``studio_desc`` from
    ``studio.description``.

    Args:
        conn: Async psycopg connection used for all statements.
        studio: Studio descriptor; ``None`` or an empty ``name`` means there
            is nothing to store.

    Returns:
        The id of the existing or newly inserted ``studios`` row, or ``None``
        when *studio* is ``None`` or has no name.
    """
    if studio is None or not studio.name:
        return None

    async with conn.cursor(row_factory=dict_row) as cur:
        # 1. Look the studio up first.
        await cur.execute(
            "SELECT id, illust_id, studio_desc FROM studios WHERE studio_name = %s",
            (studio.name,),
        )
        row = await cur.fetchone()

        if not row:
            # 2. The studio does not exist yet, so create it.
            new_illust_id: Optional[int] = None
            if studio.poster is not None:
                new_illust_id = await get_or_create_image(
                    conn, studio.poster, subdir="studios"
                )
            await cur.execute(
                """
                INSERT INTO studios (studio_name, illust_id, studio_desc)
                VALUES (%s, %s, %s)
                RETURNING id
                """,
                (studio.name, new_illust_id, studio.description),
            )
            created = await cur.fetchone()
            return created["id"]

        studio_id = row["id"]

        # 1a. No illustration stored yet but a poster was supplied: fetch it
        #     and attach it to the studio.
        if row["illust_id"] is None and studio.poster is not None:
            resolved_illust_id = await get_or_create_image(
                conn, studio.poster, subdir="studios"
            )
            # get_or_create_image returns None for a poster without a path;
            # writing that back would only overwrite NULL with NULL.
            if resolved_illust_id is not None:
                await cur.execute(
                    "UPDATE studios SET illust_id = %s WHERE id = %s",
                    (resolved_illust_id, studio_id),
                )

        # 1b. No description stored yet but enrichment produced one: save it.
        if row["studio_desc"] is None and studio.description:
            await cur.execute(
                "UPDATE studios SET studio_desc = %s WHERE id = %s",
                (studio.description, studio_id),
            )

        return studio_id
async def find_title_id_by_name_and_year(
    conn: Conn,
    title_names: Dict[str, List[str]],
    release_year: Optional[int],
) -> Optional[int]:
    """Find a ``titles`` row matching the primary name and the release year.

    The primary name is chosen by ``_choose_primary_name``. It is wrapped in
    a one-key JSON object ``{lang: [name]}`` and matched against the
    ``title_names`` column with the jsonb containment operator ``@>``.

    Args:
        conn: Async psycopg connection used for the query.
        title_names: Mapping of language code to a list of name variants.
        release_year: Year the title was released; no lookup is attempted
            when this is ``None``.

    Returns:
        The id of one matching row (the query uses ``LIMIT 1`` without an
        ordering), or ``None`` if the year is missing, no primary name can
        be chosen, or nothing matches.
    """
    if release_year is None:
        return None

    chosen = _choose_primary_name(title_names)
    if not chosen:
        return None

    lang, primary_name = chosen
    containment_probe = json.dumps({lang: [primary_name]})
    lookup_sql = (
        "\n"
        "SELECT id\n"
        "FROM titles\n"
        "WHERE release_year = %s\n"
        "AND title_names @> %s::jsonb\n"
        "LIMIT 1\n"
    )

    async with conn.cursor(row_factory=dict_row) as cur:
        await cur.execute(lookup_sql, (release_year, containment_probe))
        match = await cur.fetchone()
        return match["id"] if match else None
async def insert_title(
    conn: Conn,
    title: CanonicalTitle,
    studio_id: Optional[int],
    poster_id: Optional[int],
) -> int:
    """Insert *title* into ``titles`` unconditionally and return the new id.

    ``title.title_names`` is always serialised to JSON; ``title.episodes_len``
    is serialised only when it is not ``None``, otherwise NULL is stored.
    Both are cast to ``jsonb`` by the statement.

    Args:
        conn: Async psycopg connection used for the insert.
        title: Title whose attributes populate the new row.
        studio_id: Value for the ``studio_id`` column, or ``None``.
        poster_id: Value for the ``poster_id`` column, or ``None``.

    Returns:
        The ``id`` returned by the ``INSERT ... RETURNING`` statement.
    """
    if title.episodes_len is None:
        episodes_len_payload = None
    else:
        episodes_len_payload = json.dumps(title.episodes_len)

    insert_sql = (
        "\n"
        "INSERT INTO titles (\n"
        "title_names,\n"
        "studio_id,\n"
        "poster_id,\n"
        "title_status,\n"
        "rating,\n"
        "rating_count,\n"
        "release_year,\n"
        "release_season,\n"
        "season,\n"
        "episodes_aired,\n"
        "episodes_all,\n"
        "episodes_len\n"
        ")\n"
        "VALUES (\n"
        "%s::jsonb,\n"
        "%s,\n"
        "%s,\n"
        "%s,\n"
        "%s,\n"
        "%s,\n"
        "%s,\n"
        "%s,\n"
        "%s,\n"
        "%s,\n"
        "%s,\n"
        "%s::jsonb\n"
        ")\n"
        "RETURNING id\n"
    )

    async with conn.cursor(row_factory=dict_row) as cur:
        # Values are listed in the same order as the column list above.
        values = (
            json.dumps(title.title_names),
            studio_id,
            poster_id,
            title.title_status,
            title.rating,
            title.rating_count,
            title.release_year,
            title.release_season,
            title.season,
            title.episodes_aired,
            title.episodes_all,
            episodes_len_payload,
        )
        await cur.execute(insert_sql, values)
        inserted = await cur.fetchone()
        return inserted["id"]
async def insert_title_if_not_exists(
    conn: Conn,
    title: CanonicalTitle,
    studio_id: Optional[int],
    poster_id: Optional[int],
) -> int:
    """Return the id of the row for *title*, inserting it when not found.

    An existing row is searched for with ``find_title_id_by_name_and_year``
    using ``title.title_names`` and ``title.release_year``. When that lookup
    yields nothing, the title is inserted via ``insert_title``.

    Args:
        conn: Async psycopg connection used for the lookup and the insert.
        title: Title to look up or insert.
        studio_id: Passed to ``insert_title`` when a new row is created.
        poster_id: Passed to ``insert_title`` when a new row is created.

    Returns:
        The id of the matching existing row, or of the newly inserted one.
    """
    known_id = await find_title_id_by_name_and_year(
        conn,
        title.title_names,
        title.release_year,
    )
    if known_id is None:
        return await insert_title(conn, title, studio_id, poster_id)
    return known_id