artemis/utils/iso_639.py
2024-03-01 20:51:07 +01:00

139 lines
3.2 KiB
Python

from __future__ import annotations
import json
import re
from csv import DictReader
from io import StringIO
from typing import TYPE_CHECKING, Literal, TypedDict
from utils.common import fuzzy_search, read_json
if TYPE_CHECKING:
from bot import Artemis
class Language(TypedDict):
    """One language row as stored in ``data/iso_639_3.json``.

    The optional code columns are ``None`` when the source table leaves them
    empty: ``build()`` converts every empty column value to ``None`` before
    saving.
    """

    id: str  # ISO 639-3 identifier (exposed as "part3" in CodeResult)
    part1: str | None  # ISO 639-1 two-letter code, if the language has one
    part2b: str | None  # ISO 639-2/B three-letter code, if the language has one
    part2t: str | None  # "Part2T" column of the source table, if present
    name: str  # reference name ("Ref_Name" column of the source table)


class NameResult(TypedDict):
    """Language name paired with the source it was resolved from.

    NOTE(review): not referenced in the visible part of this module; kept for
    external users of the type.
    """

    name: str
    source: str


class CodeResult(TypedDict):
    """Codes of a single language as returned by ``get_language_code()``.

    ``part1``, ``part2b`` and ``part2t`` are copied verbatim from the matching
    ``Language`` row and are therefore ``None`` when the row has no such code.
    """

    name: str
    part1: str | None
    part2b: str | None
    part2t: str | None
    part3: str  # the row's ISO 639-3 identifier (Language["id"])
# Matching strategies accepted by get_language_code().
SearchMethod = Literal["fuzzy", "strict-start", "strict"]

# In-memory language tables. iso_639_3 holds every row of the cached JSON
# table; the other two are the subsets of rows that carry an ISO 639-1 code
# or an ISO 639-2/B code respectively.
iso_639_3: list[Language] = []
iso_639_1: list[Language] = []
iso_639_2b: list[Language] = []

try:
    iso_639_3 = read_json("data/iso_639_3.json")
except FileNotFoundError:
    # The cached table has not been written yet (build() creates it);
    # leave all three tables empty.
    pass
else:
    iso_639_1 = [lang for lang in iso_639_3 if lang["part1"]]
    iso_639_2b = [lang for lang in iso_639_3 if lang["part2b"]]
def _find_entry(seq: list[Language], lookup_key: str, query: str) -> Language | None:
    """Return the first entry of *seq* whose *lookup_key* value equals *query*.

    Entries are scanned in order; ``None`` is returned when nothing matches.
    """
    for candidate in seq:
        if candidate[lookup_key] == query:
            return candidate
    return None
def get_language_name(code: str):
    """Resolve a two- or three-letter language code to its language name.

    The code is stripped and lower-cased before lookup. Two-letter codes are
    looked up as ISO 639-1 codes. Three-letter codes are tried as ISO 639-3
    identifiers first and as ISO 639-2/B codes second.

    Returns the language name, or ``None`` when the code is empty, has any
    other length, or is not present in the tables.
    """
    normalized = code.strip().lower()

    # Pick the (table, key) pairs to consult, in priority order.
    if len(normalized) == 2:
        lookups = ((iso_639_1, "part1"),)
    elif len(normalized) == 3:
        lookups = ((iso_639_3, "id"), (iso_639_2b, "part2b"))
    else:
        # Covers the empty string as well as unsupported code lengths.
        return None

    for table, key in lookups:
        match = _find_entry(table, key, normalized)
        if match:
            return match["name"]
    return None
def get_language_code(name: str, method: SearchMethod = "fuzzy") -> list[CodeResult] | None:
    """Look up the ISO 639 codes of languages matching *name*.

    Args:
        name: Language name to search for; stripped and lower-cased first.
        method: Matching strategy.
            ``"fuzzy"`` delegates to ``fuzzy_search`` on the ``name`` field
            with ``cutoff=80``.
            ``"strict-start"`` keeps entries whose name starts with *name*
            followed by a word boundary (case-insensitive).
            ``"strict"`` keeps entries whose lower-cased name equals *name*.

    Returns:
        One ``CodeResult`` per matching entry, or ``None`` when *name* is
        empty or nothing matched.

    Raises:
        ValueError: If *method* is not one of the supported strategies.
    """
    name = name.strip().lower()
    if not name:
        return None

    if method == "fuzzy":
        found = fuzzy_search(name, iso_639_3, "name", cutoff=80)
    elif method == "strict-start":
        # Escape the query so characters such as "(" or "." are matched
        # literally instead of being interpreted as regex syntax (which could
        # raise re.error or match unrelated names). Compiled once, outside
        # the scan over the table.
        starts_with = re.compile(rf"^{re.escape(name)}\b", re.I)
        found = [entry for entry in iso_639_3 if starts_with.search(entry["name"])]
    elif method == "strict":
        found = [entry for entry in iso_639_3 if entry["name"].lower() == name]
    else:
        # Previously fell through with `found` unbound (UnboundLocalError).
        raise ValueError(f"unknown search method: {method!r}")

    if not found:
        return None

    return [
        {
            "name": entry["name"],
            "part3": entry["id"],
            "part2b": entry["part2b"],
            "part2t": entry["part2t"],
            "part1": entry["part1"],
        }
        for entry in found
    ]
async def build(bot: Artemis) -> int:
    """Download the ISO 639-3 code table and refresh the cached language data.

    Fetches the tab-separated table from iso639-3.sil.org using the bot's HTTP
    session, normalises each row, writes the result to
    ``data/iso_639_3.json`` and replaces the module-level lookup tables.

    Row normalisation: the ``Scope``, ``Language_Type`` and ``Comment``
    columns are dropped, remaining column names are lower-cased, empty values
    become ``None`` and ``ref_name`` is renamed to ``name``.

    Returns:
        The number of language entries in the rebuilt table.
    """
    global iso_639_1, iso_639_2b, iso_639_3

    url = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab"
    headers = {"User-Agent": bot.user_agent}

    async with bot.session.get(url, headers=headers) as r:
        data = await r.text()

    dropped_columns = ("Scope", "Language_Type", "Comment")
    clean_data = []
    for row in DictReader(StringIO(data), delimiter="\t"):
        # Lower-case the kept column names and store empty values as None.
        entry = {k.lower(): v or None for k, v in row.items() if k not in dropped_columns}
        entry["name"] = entry.pop("ref_name")
        clean_data.append(entry)

    with open("data/iso_639_3.json", "w") as f:
        json.dump(clean_data, f)

    # Refresh the derived tables together with the main one; otherwise
    # alpha-2 and ISO 639-2/B lookups keep using the data loaded at import
    # time (or nothing at all if the JSON file did not exist back then).
    iso_639_3 = clean_data
    iso_639_1 = [entry for entry in clean_data if entry["part1"]]
    iso_639_2b = [entry for entry in clean_data if entry["part2b"]]

    return len(clean_data)