# NOTE(review): garbled extraction fragment. Each line carries a stray embedded
# line number from the original file ("39", "45", ...) and the numbering is
# non-contiguous, so intervening statements (loop header over `matches`, the
# `try:` opener, the assignment of `video_info`, early-exit guards) are missing
# from this view. Code is left byte-identical; comments only.
#
# Purpose (as far as visible): parse a search response. Pulls every JSON
# object passed to bigPipe.onPageletArrive(...) out of resp.text (non-greedy,
# DOTALL so payloads may span lines), json-parses each, reads its "html"
# field, builds an lxml tree from it, selects the per-result
# <div class="...search-video..."> blocks, and appends each video_info dict
# that has both a truthy "title" and "url" to `results`.
39 matches = re.findall(
# regex captures the {...} argument of each onPageletArrive call;
# assumes `resp` has a .text attribute (requests-style response) — TODO confirm
r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
# `match` presumably comes from a missing `for match in matches:` loop — verify
45 json_data = json.loads(match)
# embedded HTML payload; defaults to "" when the key is absent
46 raw_html = json_data.get(
"html",
"")
# `html` here is presumably lxml.html (module-level import not visible) — TODO confirm
50 tree = html.fromstring(raw_html)
52 video_blocks = tree.xpath(
'//div[contains(@class, "search-video")]')
# `video_info` is assigned by a missing line (embedded 57?) — likely a call to
# the extractor fragment below; verify against the full file
56 for video_block
in video_blocks:
# keep only entries with a non-empty title AND url
58 if video_info
and video_info[
"title"]
and video_info[
"url"]:
59 results.append(video_info)
# malformed payloads are skipped; the except body (embedded line 62?) is not visible
61 except json.JSONDecodeError:
# NOTE(review): garbled extraction fragment (same mangling as above: embedded
# original line numbers, statements split mid-expression — e.g. the f-string
# prefixes at "75"/"76" are separated from their literals). The enclosing
# function header, the `try:` openers, the result-dict opener and several of
# its keys (embedded lines ~96-100, 103, 106-107) are missing from this view.
# Code is left byte-identical; comments only.
#
# Purpose (as far as visible): extract one search result's metadata from a
# video_block element. Reads the element's 'data-exposure-log' attribute as
# JSON, derives the watch URL and embeddable player URL from content_id,
# xpath-extracts creation time / cover image / duration / intro text, parses
# the date and duration leniently, and returns a result dict (None on failure,
# judging by the broad except at the end).
69 data_exposure_log = video_block.get(
'data-exposure-log')
# attribute value is itself a JSON blob; json.loads(None) raising TypeError is
# presumably caught by the trailing broad except
70 video_data = json.loads(data_exposure_log)
72 content_id = video_data.get(
"content_id",
"")
73 title = video_data.get(
"title",
"")
# f-string split across lines by the extraction; `base_url` is a module-level
# name not visible here — TODO confirm it is defined at file scope
75 url = f
"{base_url}/v/ac{content_id}"
76 iframe_src = f
"{base_url}/player/ac{content_id}"
# `extract_text` is an external helper (import not visible); assumed to
# flatten an xpath result into a plain string — verify
78 create_time = extract_text(video_block.xpath(
'.//span[contains(@class, "info__create-time")]'))
# NOTE(review): [0] on the xpath result here (unlike the other three calls)
# would raise IndexError on an empty result — presumably absorbed by the
# trailing except since ValueError/TypeError are caught elsewhere; confirm
79 video_cover = extract_text(video_block.xpath(
'.//div[contains(@class, "video__cover")]/a/img/@src')[0])
80 video_duration = extract_text(video_block.xpath(
'.//span[contains(@class, "video__duration")]'))
81 video_intro = extract_text(video_block.xpath(
'.//div[contains(@class, "video__main__intro")]'))
# lenient date parse; the guarding `try:` and the except body are not visible
86 published_date = datetime.strptime(create_time.strip(),
"%Y-%m-%d")
87 except (ValueError, TypeError):
# duration parsed as MM:SS via strptime, then converted to a timedelta;
# NOTE(review): this caps parseable durations at 59:59 — confirm upstream
# format before changing
93 timediff = datetime.strptime(video_duration.strip(),
"%M:%S")
94 length = timedelta(minutes=timediff.minute, seconds=timediff.second)
95 except (ValueError, TypeError):
# tail of the returned result dict; the opener and keys such as "url",
# "title", "length" (embedded lines 96-103) are missing from this view
101 "content": video_intro,
102 "thumbnail": video_cover,
104 "publishedDate": published_date,
105 "iframe_src": iframe_src,
# broad failure guard for the whole extraction; its body (likely `return None`)
# is not visible
108 except (json.JSONDecodeError, AttributeError, TypeError, ValueError):