.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
github_code.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""GitHub code search with `search syntax`_ as described in `Constructing a
3search query`_ in the documentation of GitHub's REST API.
4
5.. _search syntax:
6 https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax
7.. _Constructing a search query:
8 https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#constructing-a-search-query
9.. _Github REST API for code search:
10 https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code
11.. _Github REST API auth for code search:
12 https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
13
14Configuration
15=============
16
17The engine has the following mandatory setting:
18
19- :py:obj:`ghc_auth`
20 Change the authentication method used when using the API, defaults to none.
21
22Optional settings are:
23
24- :py:obj:`ghc_highlight_matching_lines`
25 Control the highlighting of the matched text (turns off/on).
26- :py:obj:`ghc_strip_new_lines`
27 Strip new lines at the start or end of each code fragment.
28- :py:obj:`ghc_strip_whitespace`
29 Strip any whitespace at the start or end of each code fragment.
30- :py:obj:`ghc_insert_block_separator`
31 Add a `...` between each code fragment before merging them.
32
33.. code:: yaml
34
35 - name: github code
36 engine: github_code
37 shortcut: ghc
38 ghc_auth:
39 type: "none"
40
41 - name: github code
42 engine: github_code
43 shortcut: ghc
44 ghc_auth:
45 type: "personal_access_token"
46 token: "<token>"
47 ghc_highlight_matching_lines: true
48 ghc_strip_whitespace: true
49 ghc_strip_new_lines: true
50
51
52 - name: github code
53 engine: github_code
54 shortcut: ghc
55 ghc_auth:
56 type: "bearer"
57 token: "<token>"
58
59Implementation
60===============
61
62GitHub does not return the code line indices alongside the code fragment in the
63search API. Since these are not super important for the user experience all the
64code lines are just relabeled (starting from 1) and appended (a disjoint set of
65code blocks in a single file might be returned from the API).
66"""
67
68
69import typing as t
70from urllib.parse import urlencode
71
72from searx.result_types import EngineResults
73from searx.extended_types import SXNG_Response
74from searx.network import raise_for_httperror
75
# about
# Static metadata describing the upstream service of this engine.
about = {
    "website": 'https://github.com/',
    "wikidata_id": 'Q364',
    "official_api_documentation": 'https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = ['code']


# URL template of the code search endpoint; ``request()`` fills ``{query}``
# and ``{page}`` with the url-encoded ``q=...`` and ``page=...`` arguments.
search_url = 'https://api.github.com/search/code?sort=indexed&{query}&{page}'
# https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#text-match-metadata
# Sent as the ``Accept`` header by ``request()``; ``response()`` relies on the
# ``text_matches`` entries of each returned item.
accept_header = 'application/vnd.github.text-match+json'
# NOTE(review): presumably tells SearXNG that this engine supports paging;
# ``request()`` reads the page number from ``params['pageno']``.
paging = True

# Authentication settings; ``request()`` reads ``type`` (and ``token`` for the
# token based types) to build the ``Authorization`` header.
ghc_auth = {
    "type": "none",
    "token": "",
}
99"""Change the method of authenticating to the github API.
100
101``type`` needs to be one of ``none``, ``personal_access_token``, or ``bearer``.
102When type is not ``none`` a token is expected to be passed as well in
103``ghc_auth.token``.
104
105If there are any privacy concerns about generating a token, one can use the API
106without authentication. The calls will be heavily rate limited, this is what the
107API returns on such calls::
108
109 API rate limit exceeded for <redacted ip>.
110 (But here's the good news: Authenticated requests get a higher rate limit)
111
112The personal access token or a bearer for an org or a group can be generated in
113the `GitHub settings`_.
114
115.. _GitHub settings:
116 https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens
117"""
118
# When false, ``response()`` discards the highlighted line numbers computed by
# ``extract_code()`` and passes an empty set instead.
ghc_highlight_matching_lines = True
"""Highlight the matching code lines."""

# Applied in ``extract_code()`` via ``str.lstrip("\n")`` / ``str.rstrip("\n")``.
ghc_strip_new_lines = True
"""Strip leading and trailing newlines for each returned fragment.
Single file might return multiple code fragments.
"""

# Applied in ``extract_code()`` via ``str.lstrip()`` / ``str.rstrip()``.
ghc_strip_whitespace = False
"""Strip all leading and trailing whitespace for each returned fragment.
Single file might return multiple code fragments. Enabling this might break
code indentation.
"""

# Sent by ``request()`` in the ``X-GitHub-Api-Version`` header.
ghc_api_version = "2022-11-28"
"""The version of the GitHub REST API.
"""

# When true, ``extract_code()`` appends a ``...`` line before every fragment
# except the first one.
ghc_insert_block_separator = False
"""Each file possibly consists of more than one code block that matches the
search, if this is set to true, the blocks will be separated with ``...`` line.
This might break the lexer and thus result in the lack of code highlighting.
"""
142
143
def request(query: str, params: dict[str, t.Any]) -> None:
    """Prepare the outgoing GitHub code search request in ``params``.

    Sets ``params['url']`` from :py:obj:`search_url` (url-encoded search term
    and page number), adds the ``Accept``, ``X-GitHub-Api-Version`` and -- for
    the known :py:obj:`ghc_auth` types -- ``Authorization`` headers, and
    disables the automatic HTTP error handling so that ``response()`` can
    inspect the status code itself.
    """
    query_arg = urlencode({'q': query})
    page_arg = urlencode({'page': params['pageno']})
    params['url'] = search_url.format(query=query_arg, page=page_arg)

    headers = params['headers']
    headers['Accept'] = accept_header
    headers['X-GitHub-Api-Version'] = ghc_api_version

    auth_type = ghc_auth['type']
    if auth_type == "none":
        # The query fails without an auth header, hence a dummy value is
        # sent.  Unauthenticated queries are heavily rate limited.
        headers['Authorization'] = "placeholder"
    elif auth_type == "personal_access_token":
        headers['Authorization'] = f"token {ghc_auth['token']}"
    elif auth_type == "bearer":
        headers['Authorization'] = f"Bearer {ghc_auth['token']}"

    params['raise_for_httperror'] = False
160
161
def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]:
    """Merge the code fragments of one file and compute the lines to highlight.

    Iterate over multiple possible matches, for each extract a code fragment.
    GitHub additionally sends context for *word* highlights; pygments supports
    highlighting lines, as such the lines to highlight are calculated while
    traversing the text.

    :param code_matches: text-match objects; each one needs a ``fragment``
        (the code as string) and ``matches``, a list of objects whose
        ``indices`` are ``[start, end]`` offsets into the *unstripped* fragment
        (``end`` is exclusive).
    :return: a tuple of all code lines (fragments appended one after another,
        optionally separated by a ``...`` line) and the set of 1-based line
        numbers that contain the beginning of a highlight group.
    """
    lines: list[str] = []
    highlighted_lines_index: set[int] = set()

    for match_no, match in enumerate(code_matches):
        if match_no > 0 and ghc_insert_block_separator:
            lines.append("...")

        code: str = match['fragment']
        original_code_length = len(code)

        if ghc_strip_whitespace:
            code = code.lstrip()
        if ghc_strip_new_lines:
            code = code.lstrip("\n")

        # The highlight indices refer to the unstripped fragment: remember by
        # how many characters the start of the code has been shifted.
        offset = original_code_length - len(code)

        if ghc_strip_whitespace:
            code = code.rstrip()
        if ghc_strip_new_lines:
            code = code.rstrip("\n")

        # The API is expected to deliver the groups sorted by position, so a
        # single cursor that only moves forward is sufficient.
        highlight_groups: list[list[int]] = [group['indices'] for group in match['matches']]
        group_count = len(highlight_groups)
        next_group = 0

        # pygments enumerates lines from 1
        line_no = len(lines) + 1

        for pos, letter in enumerate(code, start=offset):
            # Skip groups that end at or before the current position (e.g.
            # they are located in the stripped prefix); without this a single
            # unmatched group would block all groups that follow.
            while next_group < group_count and highlight_groups[next_group][1] <= pos:
                next_group += 1

            # After the skip above ``end > pos`` holds, only the start of the
            # group needs to be checked.  A group is consumed by the first
            # position it covers, i.e. only its first line is highlighted.
            if next_group < group_count and highlight_groups[next_group][0] <= pos:
                highlighted_lines_index.add(line_no)
                next_group += 1

            if letter == "\n":
                line_no += 1

        lines.extend(code.split("\n"))

    return lines, highlighted_lines_index
212
213
def response(resp: SXNG_Response) -> EngineResults:
    """Convert the JSON answer of the GitHub code search into ``Code`` results.

    A response with status code 422 yields an empty result list, any other
    HTTP error is raised by ``raise_for_httperror``.  For each item only the
    text matches of the file content are used; their fragments are merged by
    ``extract_code()`` and renumbered starting from 1.
    """
    res = EngineResults()

    if resp.status_code == 422:
        # on an invalid search term the status code 422 "Unprocessable Content"
        # is returned / e.g. search term is "user: foo" instead of "user:foo"
        return res
    # raise for other errors
    raise_for_httperror(resp)

    for item in resp.json().get('items', []):
        repo: dict[str, str] = item['repository']  # pyright: ignore[reportAny]
        text_matches: list[dict[str, str]] = item['text_matches']  # pyright: ignore[reportAny]
        # ensure picking only the code contents in the blob
        code_matches = [
            match for match in text_matches if match["object_type"] == "FileContent" and match["property"] == "content"
        ]
        lines, highlighted_lines_index = extract_code(code_matches)
        if not ghc_highlight_matching_lines:
            highlighted_lines_index = set()

        res.add(
            res.types.Code(
                url=item["html_url"],  # pyright: ignore[reportAny]
                # separator is U+00B7 (middle dot)
                title=f"{repo['full_name']} · {item['name']}",
                filename=f"{item['path']}",
                # NOTE(review): the description is passed through unchanged;
                # confirm that the Code result accepts a missing (null) value.
                content=repo['description'],
                repository=repo['html_url'],
                # the API does not return line numbers, relabel from 1
                codelines=list(enumerate(lines, start=1)),
                hl_lines=highlighted_lines_index,
                strip_whitespace=ghc_strip_whitespace,
                strip_new_lines=ghc_strip_new_lines,
            )
        )

    return res
None request(str query, dict[str, t.Any] params)
EngineResults response(SXNG_Response resp)
tuple[list[str], set[int]] extract_code(list[dict[str, t.Any]] code_matches)