# arxiv_8py_source.html

# SPDX-License-Identifier: AGPL-3.0-or-later

"""arXiv is a free distribution service and an open-access archive for nearly

2.4 million scholarly articles in the fields of physics, mathematics, computer

science, quantitative biology, quantitative finance, statistics, electrical

engineering and systems science, and economics.


The engine uses the `arXiv API`_.


.. _arXiv API: https://info.arxiv.org/help/api/user-manual.html

"""


import typing as t


from datetime import datetime

from urllib.parse import urlencode


from lxml import etree

from lxml.etree import XPath

from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex

from searx.result_types import EngineResults


if t.TYPE_CHECKING:

    from searx.extended_types import SXNG_Response

    from searx.search.processors import OnlineParams


# Engine metadata: where the service lives, how it is identified and which
# kind of API / result format this engine talks to.
about: dict[str, t.Any] = {
    # service identity
    "website": "https://arxiv.org",
    "wikidata_id": "Q118398",
    # API description
    "official_api_documentation": "https://info.arxiv.org/help/api/user-manual.html",
    "use_official_api": True,
    "require_api_key": False,
    # format tag of the responses handled by this engine
    "results": "XML-RSS",
}


# Engine settings.
categories: list[str] = ["science", "scientific publications"]
paging: bool = True

# Number of entries requested per API call; ``request`` also uses it as the
# step between two consecutive pages.
arxiv_max_results: int = 10

arxiv_search_prefix: str = "all"
"""Search field that is put in front of the query, for more details see
`Details of Query Construction`_.

.. _Details of Query Construction:
   https://info.arxiv.org/help/api/user-manual.html#51-details-of-query-construction
"""

base_url: str = "https://export.arxiv.org/api/query"
"""URL of the `arXiv API`_, for more details see Query-Interface_

.. _Query-Interface: https://info.arxiv.org/help/api/user-manual.html#_query_interface
"""


# Prefix -> namespace URI map shared by every XPath expression below.
arxiv_namespaces: dict[str, str] = {
    "atom": "http://www.w3.org/2005/Atom",
    "arxiv": "http://arxiv.org/schemas/atom",
}

# The XPath expressions are compiled once at import time and reused for every
# response.

# Selects all entries of the feed (absolute path, evaluated on the document).
xpath_entry = XPath("//atom:entry", namespaces=arxiv_namespaces)

# Elements from the Atom namespace, evaluated relative to a single entry.
xpath_id = XPath(".//atom:id", namespaces=arxiv_namespaces)
xpath_title = XPath(".//atom:title", namespaces=arxiv_namespaces)
xpath_summary = XPath(".//atom:summary", namespaces=arxiv_namespaces)
xpath_published = XPath(".//atom:published", namespaces=arxiv_namespaces)
xpath_author_name = XPath(".//atom:author/atom:name", namespaces=arxiv_namespaces)
xpath_category = XPath(".//atom:category/@term", namespaces=arxiv_namespaces)
xpath_pdf = XPath(".//atom:link[@title='pdf']", namespaces=arxiv_namespaces)

# Elements from the arXiv extension namespace, evaluated relative to a single
# entry.  Note that the comment expression only matches direct children ("./")
# while the others match any descendant (".//").
xpath_doi = XPath(".//arxiv:doi", namespaces=arxiv_namespaces)
xpath_journal = XPath(".//arxiv:journal_ref", namespaces=arxiv_namespaces)
xpath_comment = XPath("./arxiv:comment", namespaces=arxiv_namespaces)


def request(query: str, params: "OnlineParams") -> None:
    """Assemble the arXiv API request URL and store it in ``params["url"]``.

    The search term is ``<arxiv_search_prefix>:<query>``.  The ``start`` offset
    is computed from ``params["pageno"]`` so that page 1 maps to offset 0 and
    each following page advances by ``arxiv_max_results`` entries.
    """
    offset = (params["pageno"] - 1) * arxiv_max_results
    query_string = urlencode(
        [
            ("search_query", f"{arxiv_search_prefix}:{query}"),
            ("start", offset),
            ("max_results", arxiv_max_results),
        ]
    )
    params["url"] = f"{base_url}?{query_string}"


def _text_or_empty(element: t.Any) -> str:
    """Return ``element.text``, or ``""`` when the element or its text is missing."""
    if element is None:
        return ""
    # ``.text`` is None for an element without text content (e.g. ``<doi/>``).
    return element.text or ""


def response(resp: "SXNG_Response") -> EngineResults:
    """Convert the XML document in ``resp.content`` into ``Paper`` results.

    One result is added per ``atom:entry`` element.  Title, id (used as URL),
    summary and publication date are looked up without a default value; DOI,
    PDF link, journal reference and comment are optional and fall back to an
    empty string.  Textual fields are always ``str``: an element that is
    present but empty yields ``""`` instead of ``None``.

    :param resp: HTTP response whose ``content`` is the XML returned by the API.
    :return: the collected results.
    """
    res = EngineResults()

    dom = etree.fromstring(resp.content)
    for entry in eval_xpath_list(dom, xpath_entry):

        # Lookups without ``default=``: presumably eval_xpath_getindex raises
        # when the element is missing -- confirm in searx.utils.
        title: str = _text_or_empty(eval_xpath_getindex(entry, xpath_title, 0))
        url: str = _text_or_empty(eval_xpath_getindex(entry, xpath_id, 0))
        abstract: str = _text_or_empty(eval_xpath_getindex(entry, xpath_summary, 0))

        # Skip author elements without a name so the list only contains str.
        authors: list[str] = [author.text for author in eval_xpath_list(entry, xpath_author_name) if author.text]

        # Optional fields
        doi: str = _text_or_empty(eval_xpath_getindex(entry, xpath_doi, 0, default=None))
        journal: str = _text_or_empty(eval_xpath_getindex(entry, xpath_journal, 0, default=None))
        comments: str = _text_or_empty(eval_xpath_getindex(entry, xpath_comment, 0, default=None))

        # pdf: a link element without ``href`` must not turn into None
        pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
        pdf_url: str = "" if pdf_element is None else pdf_element.attrib.get("href", "")

        # tags: the XPath selects ``@term`` attribute values
        tags: list[str] = [str(tag) for tag in eval_xpath(entry, xpath_category)]

        # The timestamp is parsed into a naive datetime, the trailing "Z" is
        # matched literally by the format string.
        publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, "%Y-%m-%dT%H:%M:%SZ")

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                publishedDate=publishedDate,
                content=abstract,
                doi=doi,
                authors=authors,
                journal=journal,
                tags=tags,
                comments=comments,
                pdf_url=pdf_url,
            )
        )

    return res


# Cross references (generated documentation index):
#
# searx.result_types.EngineResults
#   Definition __init__.py:67
#
# searx.engines.arxiv.request
#   None request(str query, "OnlineParams" params)
#   Definition arxiv.py:68
#
# searx.engines.arxiv.response
#   EngineResults response("SXNG_Response" resp)
#   Definition arxiv.py:78
#
# searx.extended_types
#   Definition extended_types.py:1
#
# searx.result_types
#   Definition __init__.py:1
#
# searx.search.processors
#   Definition __init__.py:1
#
# searx.utils
#   Definition utils.py:1