diff options
author | Göktürk Yüksek <gokturk@gentoo.org> | 2019-12-09 21:08:12 -0500 |
---|---|---|
committer | Göktürk Yüksek <gokturk@gentoo.org> | 2019-12-19 15:58:02 -0500 |
commit | 926e0d0855afa40f5dcbc16b1b7c66187afd7d73 (patch) | |
tree | 5164b61fbe2a769c1d5fbef09dea9f07a25a4b23 /bin | |
parent | rename search_index.py to build_search_documents.py, and move it to bin/ (diff) | |
download | devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.tar.gz devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.tar.bz2 devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.zip |
Rewrite the search functionality and extend the coverage
The current script only indexes the first <p> in a text.xml, and
sometimes only partially if the text is interrupted by another tag
such as <c/>.
Modify build_search_documents.py such that:
- It recursively traverses from chapter all the way down to
subsubsection
- Each <p>, <important>, <note>, <warning> is indexed separately
- In the search results, the match entry will have the title in the
form "Chapter[ -> Section[ -> Subsection[ -> Subsubsection]]]"
Modify search.js such that:
- The ref returned for a match is its index into "documents" array,
which makes it possible to retrieve the document in O(1).
Signed-off-by: Göktürk Yüksek <gokturk@gentoo.org>
Diffstat (limited to 'bin')
-rwxr-xr-x | bin/build_search_documents.py | 112 |
1 files changed, 98 insertions, 14 deletions
#!/usr/bin/python3
# Copyright 2019 Gentoo Authors
# Distributed under the terms of the GNU GPL version 2 or later
"""Build the search documents array for the devmanual search page.

Every text.xml given on the command line is parsed, and each <p>,
<important>, <note> and <warning> found while walking from <chapter>
down to <subsubsection> becomes one entry of the "documents" array,
which is written to stdout as a JavaScript variable definition.
"""
import json
import os.path
import sys
import xml.etree.ElementTree as ET
from typing import Optional

# Tags whose <title> is appended to the search result title and turned
# into the anchor part of the url.
_TITLED_TAGS = ('section', 'subsection', 'subsubsection')
# Tags that are only traversed; they contribute neither title nor text.
_CONTAINER_TAGS = ('body', 'guide')
# Terminal tags; each occurrence becomes one search document.
_TEXT_TAGS = ('p', 'important', 'note', 'warning')


def stringify_node(parent: ET.Element) -> str:
    """Flatten this node and its immediate children to a string.

    Combine the text of this node with the text and tail of each of
    its immediate children, if there are any, into a flat string. The
    tag <d/> is a special case that resolves to the dash ('-')
    character. Runs of whitespace (newlines and indentation included)
    are collapsed to a single space and the result is stripped.

    The tail of the node itself is not included: it is the text that
    follows the closing tag and therefore belongs to the enclosing
    element. It is also None for a root element.

    Keyword arguments:
    parent -- the node to convert to a string

    """
    parts = [parent.text or '']

    # Iterate the element directly; Element.getchildren() no longer
    # exists on Python 3.9 and later.
    for child in parent:
        # The '<d/>' tag is simply a fancier '-' character
        if child.tag == 'd':
            parts.append('-')
        parts.append(child.text or '')
        # Keep the tail unstripped here, otherwise the word before the
        # next inline tag gets glued to that tag's text.
        parts.append(child.tail or '')

    # Normalise whitespace once, at the very end.
    return ' '.join(''.join(parts).split())


def _node_title(node: ET.Element) -> str:
    """Return the flattened <title> of a node, or '' if it has none."""
    title = node.find('title')
    if title is None:
        return ''
    return stringify_node(title)


def process_node(documents: list, node: ET.Element,
                 name: Optional[str], url: str) -> None:
    """Recursively process a given node and its children based on tag values.

    For the top level node <chapter>, extract the title and recurse
    down to the children.
    For the intermediary nodes with titles, such as <section>, update
    the search result title and url, and recurse down.
    For the terminal nodes, such as <p>, convert the contents of the
    node to a string, and add it to the search documents.
    Nodes with any other tag are ignored, together with their children.

    Keyword arguments:
    documents -- the search documents array; new entries are appended
    node -- the node to process
    name -- the title to display for the search term match, or None
            when no titled ancestor has been seen yet
    url -- the url for the search term match in the document

    """
    tag = node.tag

    if tag in _TEXT_TAGS:
        # The id doubles as the index into the documents array.
        documents.append({'id': len(documents),
                          'name': name,
                          'text': stringify_node(node),
                          'url': url})
        return

    if tag == 'chapter':
        name = _node_title(node)
    elif tag in _TITLED_TAGS:
        title = _node_title(node)
        # A section may appear without an enclosing chapter title, in
        # which case there is nothing to prefix the title with.
        name = '{} -> {}'.format(name, title) if name else title
        if title:
            # Replace any anchor inherited from an outer section.
            url = "{url_base}#{anchor}".format(
                url_base=url.split('#')[0],
                anchor=title.lower().replace(' ', '-'))
    elif tag not in _CONTAINER_TAGS:
        return

    for child in node:
        process_node(documents, child, name, url)


def main(pathnames: list) -> None:
    """The entry point of the script.

    Parse every file, collect the search documents, and print them to
    stdout as "var documents = [...];". Parse and I/O errors propagate
    to the caller.

    Keyword arguments:
    pathnames -- a list of path names to process in sequential order
    """
    url_root = 'https://devmanual.gentoo.org/'
    documents = []

    for path in pathnames:
        root = ET.parse(path).getroot()
        url = url_root + os.path.dirname(path) + '/'

        process_node(documents, root, None, url)

    print('var documents = ' + json.dumps(documents) + ';')


if __name__ == '__main__':
    main(sys.argv[1:])