etl module added

This commit is contained in:
garaev kamil 2025-12-05 22:59:33 +03:00
parent 0f619dd954
commit ff36173720
16 changed files with 1573 additions and 0 deletions

View file

@ -0,0 +1,184 @@
# anime_etl/anilist_normalizer.py
from __future__ import annotations
from typing import Any, Dict, List, Optional
from models import Source, SourceTitle, Studio, Image, Tag
from utils.season_resolver import resolve_season_from_media
# Raw AniList ``status`` strings mapped to the internal title-status values
# (looked up by ``_status``).
# NOTE(review): CANCELLED maps to "planned" and HIATUS to "ongoing" --
# presumably the target schema only knows three statuses; confirm.
STATUS_MAP: Dict[str, str] = dict(
    FINISHED="finished",
    RELEASING="ongoing",
    NOT_YET_RELEASED="planned",
    CANCELLED="planned",
    HIATUS="ongoing",
)
# Upper-case season names (as looked up by ``_year_and_season``) mapped to
# their lower-case counterparts.
SEASON_MAP: Dict[str, str] = {
    season_name: season_name.lower()
    for season_name in ("WINTER", "SPRING", "SUMMER", "FALL")
}
def _title_names(media: Dict[str, Any]) -> Dict[str, List[str]]:
t = media.get("title") or {}
native = t.get("native")
english = t.get("english")
romaji = t.get("romaji")
res: Dict[str, List[str]] = {}
if native:
res.setdefault("ja", []).append(native)
if english:
res.setdefault("en", []).append(english)
if romaji:
res.setdefault("romaji", []).append(romaji)
return res
def _studio(media: Dict[str, Any]) -> Optional[Studio]:
studios_nodes = (media.get("studios") or {}).get("nodes") or []
if not studios_nodes:
return None
name = studios_nodes[0].get("name")
if not name:
return None
return Studio(id=None, name=name, poster=None, description=None)
def _tags(media: Dict[str, Any]) -> List[Tag]:
genres = media.get("genres") or []
res: List[Tag] = []
for g in genres:
if g:
res.append(Tag(names={"en": g}))
return res
def _poster(media: Dict[str, Any]) -> Optional[Image]:
cover = media.get("coverImage") or {}
url = cover.get("extraLarge") or cover.get("large")
if not url:
return None
return Image(id=None, storage_type=None, image_path=url)
def _status(media: Dict[str, Any]) -> Optional[str]:
raw = media.get("status")
if not raw:
return None
return STATUS_MAP.get(raw)
def _rating(media: Dict[str, Any]) -> Optional[float]:
avg = media.get("averageScore")
if avg is None:
return None
try:
return float(avg) / 10.0
except (TypeError, ValueError):
return None
def _rating_count(media: Dict[str, Any]) -> Optional[int]:
pop = media.get("popularity")
if pop is None:
return None
try:
return int(pop)
except (TypeError, ValueError):
return None
def _year_and_season(media: Dict[str, Any]) -> tuple[Optional[int], Optional[str]]:
year = media.get("seasonYear")
raw_season = media.get("season")
release_year = year if isinstance(year, int) else None
release_season = None
if isinstance(raw_season, str):
release_season = SEASON_MAP.get(raw_season.upper())
return release_year, release_season
def _episodes(media: Dict[str, Any]) -> tuple[Optional[int], Optional[int]]:
episodes_all = media.get("episodes")
if not isinstance(episodes_all, int):
episodes_all = None
next_ep = media.get("nextAiringEpisode") or {}
ep_num = next_ep.get("episode") if isinstance(next_ep, dict) else None
if not isinstance(ep_num, int):
ep_num = None
# базовая логика
if ep_num is not None:
episodes_aired = ep_num - 1
else:
episodes_aired = episodes_all
# приведение к инварианту БД:
# либо обе NULL, либо обе заданы и episodes_aired <= episodes_all
if episodes_aired is None and episodes_all is None:
return None, None
if episodes_all is None and episodes_aired is not None:
episodes_all = episodes_aired
if episodes_aired is None and episodes_all is not None:
episodes_aired = episodes_all
if (
episodes_aired is not None
and episodes_all is not None
and episodes_aired > episodes_all
):
episodes_aired = episodes_all
return episodes_aired, episodes_all
def _episodes_len(media: Dict[str, Any]) -> Optional[Dict[str, float]]:
duration = media.get("duration")
if duration is None:
return None
try:
return {"default": float(duration)}
except (TypeError, ValueError):
return None
def normalize_media(media: Dict[str, Any]) -> SourceTitle:
    """Convert an AniList Media JSON object into our ``SourceTitle``.

    Every field is produced by the matching private helper of this module;
    ``season`` comes from ``resolve_season_from_media``. The result is
    tagged with ``Source.ANILIST`` and uses ``str(media["id"])`` as its
    external id.

    Raises:
        KeyError: if *media* has no ``"id"`` key.
    """
    # Descriptive data.
    names = _title_names(media)
    main_studio = _studio(media)
    genre_tags = _tags(media)
    cover = _poster(media)

    # Status and scores.
    status = _status(media)
    score = _rating(media)
    score_count = _rating_count(media)

    # Release window and episode counters.
    year, season_name = _year_and_season(media)
    aired, total = _episodes(media)
    length = _episodes_len(media)
    resolved_season = resolve_season_from_media(media)

    return SourceTitle(
        source=Source.ANILIST,
        external_id=str(media["id"]),
        title_names=names,
        studio=main_studio,
        tags=genre_tags,
        poster=cover,
        title_status=status,
        rating=score,
        rating_count=score_count,
        release_year=year,
        release_season=season_name,
        season=resolved_season,
        episodes_aired=aired,
        episodes_all=total,
        episodes_len=length,
    )