Fix OG metadata scraping and improve workers

This commit is contained in:
Thomas Sileo 2022-11-13 13:00:22 +01:00
parent c3eb44add7
commit 793a939046
5 changed files with 37 additions and 7 deletions

View file

@ -1,12 +1,15 @@
import asyncio
import mimetypes
import re
import signal
from concurrent.futures import TimeoutError
from typing import Any
from urllib.parse import urlparse
import httpx
from bs4 import BeautifulSoup # type: ignore
from loguru import logger
from pebble import concurrent # type: ignore
from pydantic import BaseModel
from app import activitypub as ap
@ -29,7 +32,11 @@ class OpenGraphMeta(BaseModel):
site_name: str
@concurrent.process(timeout=5)
def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
# Prevent SIGTERM from bubbling up to the worker
signal.signal(signal.SIGTERM, signal.SIG_IGN)
soup = BeautifulSoup(html, "html5lib")
ogs = {
og.attrs["property"]: og.attrs.get("content")
@ -58,6 +65,10 @@ def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
return OpenGraphMeta.parse_obj(raw)
def scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
    """Scrape OpenGraph metadata for `url` from `html` and wait for the result.

    `_scrap_og_meta` is wrapped with `concurrent.process(timeout=5)`, so
    calling it yields a future-like handle instead of the parsed value. This
    wrapper resolves that handle with `.result()` and returns whatever it
    produces.

    Nothing is caught here: any exception raised while waiting propagates to
    the caller (presumably including a timeout once the 5 second limit is
    exceeded -- confirm against pebble's documentation).
    """
    pending = _scrap_og_meta(url, html)
    return pending.result()
async def external_urls(
db_session: AsyncSession,
ro: ap_object.RemoteObject | OutboxObject | InboxObject,
@ -126,7 +137,10 @@ async def _og_meta_from_url(url: str) -> OpenGraphMeta | None:
return None
try:
return _scrap_og_meta(url, resp.text)
return scrap_og_meta(url, resp.text)
except TimeoutError:
logger.info(f"Timed out when scraping OG meta for {url}")
return None
except Exception:
logger.info(f"Failed to scrap OG meta for {url}")
return None