etl module added
This commit is contained in:
parent
0f619dd954
commit
ff36173720
16 changed files with 1573 additions and 0 deletions
235
modules/anime_etl/db/repository.py
Normal file
235
modules/anime_etl/db/repository.py
Normal file
|
|
@ -0,0 +1,235 @@
|
|||
# anime_etl/db/repository.py
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Optional, Dict, List
|
||||
|
||||
import psycopg
|
||||
from psycopg.rows import dict_row
|
||||
|
||||
from models import CanonicalTitle, Studio, Image
|
||||
from images.downloader import ensure_image_downloaded
|
||||
|
||||
|
||||
# Shorthand for the async psycopg connection type; used to annotate the
# `conn` parameter of the query helpers defined in this module.
Conn = psycopg.AsyncConnection
|
||||
|
||||
|
||||
def _choose_primary_name(
    title_names: Dict[str, List[str]],
) -> Optional[tuple[str, str]]:
    """Pick one ``(lang, name)`` pair to represent a title.

    The languages "en", "romaji" and "ja" are tried in that order; the first
    one with a non-empty list of variants wins and its first variant is
    returned. If none of them has variants, the first non-empty entry in the
    mapping's own iteration order is used instead.

    Returns:
        A ``(lang, name)`` tuple, or None when every entry is empty or the
        mapping itself is empty.
    """
    preferred_langs = ("en", "romaji", "ja")

    for code in preferred_langs:
        names = title_names.get(code)
        if names:
            return code, names[0]

    # No preferred language available: fall back to whatever comes first.
    return next(
        ((code, names[0]) for code, names in title_names.items() if names),
        None,
    )
|
||||
|
||||
|
||||
async def get_or_create_image(
    conn: Conn,
    img: Optional[Image],
    *,
    subdir: str = "posters",
) -> Optional[int]:
    """Return the ``images.id`` for *img*, inserting a row if none matches.

    ``img.image_path`` is handed to ``ensure_image_downloaded`` together with
    *subdir*; the value that call returns is the path that is looked up in,
    and if necessary inserted into, the ``images`` table (with
    ``storage_type`` set to "local").

    Args:
        conn: Async connection used for the lookup and the insert.
        img: Image to resolve. Per the original author's note, its
            ``image_path`` currently holds a URL from AniList.
        subdir: Forwarded to ``ensure_image_downloaded``.

    Returns:
        The id of the existing or newly inserted row, or None when *img* is
        None or its ``image_path`` is empty.
    """
    if img is None or not img.image_path:
        return None

    # Decide where the image is stored and download it if necessary.
    stored_path = await ensure_image_downloaded(img.image_path, subdir=subdir)

    async with conn.cursor(row_factory=dict_row) as cur:
        # Reuse the row that already points at this stored path, if any.
        await cur.execute(
            "SELECT id FROM images WHERE image_path = %s",
            (stored_path,),
        )
        existing = await cur.fetchone()
        if existing:
            return existing["id"]

        # Nothing found: register the image.
        await cur.execute(
            """
            INSERT INTO images (storage_type, image_path)
            VALUES (%s, %s)
            RETURNING id
            """,
            ("local", stored_path),
        )
        created = await cur.fetchone()
        return created["id"]
|
||||
|
||||
|
||||
async def get_or_create_studio(
    conn: Conn,
    studio: Optional[Studio],
) -> Optional[int]:
    """Return the ``studios.id`` for *studio*, creating the row if needed.

    The studio is looked up by ``studio_name``. When a row already exists,
    two columns are back-filled if they are still NULL and the incoming
    object carries a value for them: ``illust_id`` (resolved from
    ``studio.poster`` via ``get_or_create_image`` with ``subdir="studios"``)
    and ``studio_desc`` (from ``studio.description``). Columns that already
    hold a value are never overwritten.

    Args:
        conn: Async connection used for all statements.
        studio: Studio to resolve.

    Returns:
        The id of the existing or newly inserted row, or None when *studio*
        is None or has an empty name.
    """
    if studio is None or not studio.name:
        return None

    async with conn.cursor(row_factory=dict_row) as cur:
        # 1. Look the studio up by name first.
        await cur.execute(
            "SELECT id, illust_id, studio_desc FROM studios WHERE studio_name = %s",
            (studio.name,),
        )
        row = await cur.fetchone()

        if row:
            studio_id = row["id"]

            # 1a. No illustration stored yet but a poster was supplied:
            #     resolve it and attach it to the studio.
            if row["illust_id"] is None and studio.poster is not None:
                new_illust_id = await get_or_create_image(
                    conn, studio.poster, subdir="studios"
                )
                # get_or_create_image returns None for a poster without a
                # path; writing NULL over NULL would be a pointless UPDATE.
                if new_illust_id is not None:
                    await cur.execute(
                        "UPDATE studios SET illust_id = %s WHERE id = %s",
                        (new_illust_id, studio_id),
                    )

            # 1b. No description stored yet but one is available now.
            if row["studio_desc"] is None and studio.description:
                await cur.execute(
                    "UPDATE studios SET studio_desc = %s WHERE id = %s",
                    (studio.description, studio_id),
                )

            return studio_id

        # 2. Unknown studio: resolve the poster (if any) and insert a new row.
        illust_id = None
        if studio.poster is not None:
            illust_id = await get_or_create_image(
                conn, studio.poster, subdir="studios"
            )

        await cur.execute(
            """
            INSERT INTO studios (studio_name, illust_id, studio_desc)
            VALUES (%s, %s, %s)
            RETURNING id
            """,
            (studio.name, illust_id, studio.description),
        )
        row = await cur.fetchone()
        return row["id"]
|
||||
|
||||
async def find_title_id_by_name_and_year(
    conn: Conn,
    title_names: Dict[str, List[str]],
    release_year: Optional[int],
) -> Optional[int]:
    """Look up a title id by release year and one representative name.

    The name is chosen with ``_choose_primary_name``. The query selects a row
    from ``titles`` whose ``release_year`` equals *release_year* and whose
    ``title_names`` jsonb column contains (``@>``) the probe
    ``{lang: [name]}``. At most one row is fetched (``LIMIT 1`` with no
    ordering).

    Returns:
        The matching ``titles.id``, or None when *release_year* is None, no
        name can be chosen, or no row matches.
    """
    if release_year is None:
        return None

    chosen = _choose_primary_name(title_names)
    if not chosen:
        return None

    lang, primary_name = chosen
    # Serialised as JSON text; the query casts it to jsonb for the @> test.
    containment_probe = json.dumps({lang: [primary_name]})

    async with conn.cursor(row_factory=dict_row) as cur:
        await cur.execute(
            """
            SELECT id
            FROM titles
            WHERE release_year = %s
            AND title_names @> %s::jsonb
            LIMIT 1
            """,
            (release_year, containment_probe),
        )
        match = await cur.fetchone()

    return match["id"] if match else None
|
||||
|
||||
|
||||
async def insert_title(
    conn: Conn,
    title: CanonicalTitle,
    studio_id: Optional[int],
    poster_id: Optional[int],
) -> int:
    """Insert *title* into ``titles`` and return the new row's id.

    ``title.title_names`` and ``title.episodes_len`` are serialised with
    ``json.dumps`` and cast to jsonb in the statement; a None
    ``episodes_len`` is passed through as NULL rather than as JSON ``null``.
    All other fields are passed as-is. No existence check is performed here.

    Args:
        conn: Async connection used for the insert.
        title: Source of the column values.
        studio_id: Value for ``titles.studio_id`` (may be None).
        poster_id: Value for ``titles.poster_id`` (may be None).

    Returns:
        The ``id`` returned by the INSERT.
    """
    if title.episodes_len is not None:
        episodes_len_json = json.dumps(title.episodes_len)
    else:
        episodes_len_json = None

    insert_sql = """
        INSERT INTO titles (
            title_names,
            studio_id,
            poster_id,
            title_status,
            rating,
            rating_count,
            release_year,
            release_season,
            season,
            episodes_aired,
            episodes_all,
            episodes_len
        )
        VALUES (
            %s::jsonb,
            %s,
            %s,
            %s,
            %s,
            %s,
            %s,
            %s,
            %s,
            %s,
            %s,
            %s::jsonb
        )
        RETURNING id
        """

    # Order must match the column list in the statement above.
    params = (
        json.dumps(title.title_names),
        studio_id,
        poster_id,
        title.title_status,
        title.rating,
        title.rating_count,
        title.release_year,
        title.release_season,
        title.season,
        title.episodes_aired,
        title.episodes_all,
        episodes_len_json,
    )

    async with conn.cursor(row_factory=dict_row) as cur:
        await cur.execute(insert_sql, params)
        inserted = await cur.fetchone()
        return inserted["id"]
|
||||
|
||||
|
||||
|
||||
async def insert_title_if_not_exists(
    conn: Conn,
    title: CanonicalTitle,
    studio_id: Optional[int],
    poster_id: Optional[int],
) -> int:
    """Return the id of a matching title, inserting *title* if none is found.

    The match is delegated to ``find_title_id_by_name_and_year`` using
    ``title.title_names`` and ``title.release_year``; when it yields no id,
    the title is inserted with ``insert_title``. The lookup and the insert
    are two separate statements.

    Returns:
        The id of the existing row, or of the row that was just inserted.
    """
    found_id = await find_title_id_by_name_and_year(
        conn,
        title.title_names,
        title.release_year,
    )
    if found_id is None:
        return await insert_title(conn, title, studio_id, poster_id)

    return found_id
|
||||
Loading…
Add table
Add a link
Reference in a new issue