aboutsummaryrefslogtreecommitdiff
path: root/bin
diff options
context:
space:
mode:
authorGöktürk Yüksek <gokturk@gentoo.org>2019-12-09 21:08:12 -0500
committerGöktürk Yüksek <gokturk@gentoo.org>2019-12-19 15:58:02 -0500
commit926e0d0855afa40f5dcbc16b1b7c66187afd7d73 (patch)
tree5164b61fbe2a769c1d5fbef09dea9f07a25a4b23 /bin
parentrename search_index.py to build_search_documents.py, and move it to bin/ (diff)
downloaddevmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.tar.gz
devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.tar.bz2
devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.zip
Rewrite the search functionality and extend the coverage
The current script only indexes the first <p> in a text.xml, and sometimes only partially if the text is interrupted by another tag such as <c/>. Modify build_search_documents.py such that: - It recursively traverses from chapter all the way down to subsubsection - Each <p>, <important>, <note>, <warning> is indexed separately - In the search results, the match entry will have the title in the form "Chapter[ -> Section[ -> Subsection[ -> Subsubsection]]]" Modify search.js such that: - The ref returned for a match is its index into "documents" array, which makes it possible to retrieve the document in O(1). Signed-off-by: Göktürk Yüksek <gokturk@gentoo.org>
Diffstat (limited to 'bin')
-rwxr-xr-xbin/build_search_documents.py112
1 files changed, 98 insertions, 14 deletions
diff --git a/bin/build_search_documents.py b/bin/build_search_documents.py
index 9af2753..3816fdb 100755
--- a/bin/build_search_documents.py
+++ b/bin/build_search_documents.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
# Copyright 2019 Gentoo Authors
# Distributed under the terms of the GNU GPL version 2 or later
import json
@@ -6,19 +6,103 @@ import os.path
import sys
import xml.etree.ElementTree as ET
-files = sys.argv[1:]
-documents = []
-url_root = 'https://devmanual.gentoo.org/'
-for f in files:
- tree = ET.parse(f)
- root = tree.getroot()
- for chapter in root.findall('chapter'):
def stringify_node(parent: ET.Element) -> str:
    """Flatten this node and its immediate children to a string.

    Combine the text and tail of this node, and any of its immediate
    children, if there are any, into a flat string. The tag <d/> is a
    special case that resolves to the dash ('-') character.

    Keyword arguments:
    parent -- the node to convert to a string

    """
    text = parent.text.lstrip() if parent.text else ''

    # Element.getchildren() was removed in Python 3.9; iterating the
    # element directly is the supported equivalent.
    for child in parent:
        # The '<d/>' tag is simply a fancier '-' character
        if child.tag == 'd':
            text += '-'
        if child.text:
            text += child.text.lstrip()
        if child.tail:
            text += child.tail.rstrip()

    # tail is None for a root element (or the last child of its
    # parent); guard it like text above to avoid an AttributeError.
    if parent.tail:
        text += parent.tail.rstrip()

    # Collapse hard line breaks so the search index sees one flat line.
    return text.replace('\n', ' ')
+
+
def process_node(documents: list, node: ET.Element, name: str, url: str) -> None:
    """Recursively process a given node and its children based on tag values.

    For the top level node <chapter>, extract the title and recurse
    down to the children.
    For the intermediary nodes with titles, such as <section>, update
    the search result title and url, and recurse down.
    For the terminal nodes, such as <p>, convert the contents of the
    node to a string, and add it to the search documents.

    Keyword arguments:
    documents -- the search documents array
    node -- the node to process
    name -- the title to display for the search term match
    url -- the url for the search term match in the document

    """
    tag = node.tag

    # Terminal nodes: flatten to text and record one search document.
    if tag in ('p', 'important', 'note', 'warning'):
        documents.append({'id': len(documents),
                          'name': name,
                          'text': stringify_node(node),
                          'url': url})
        return

    if tag == 'chapter':
        # Top of the hierarchy: the chapter title seeds the match name.
        name = stringify_node(node.find('title'))
    elif tag in ('section', 'subsection', 'subsubsection'):
        # Titled intermediary: extend the display name and re-anchor
        # the URL fragment at this level's title.
        title = stringify_node(node.find('title'))
        name = name + ' -> ' + title
        url = '{url_base}#{anchor}'.format(
            url_base=url.split('#')[0],
            anchor=title.lower().replace(' ', '-'))
    elif tag not in ('body', 'guide'):
        # Any other tag is not part of the searchable structure.
        return

    for child in node:
        process_node(documents, child, name, url)
+
+
def main(pathnames: list) -> None:
    """The entry point of the script.

    Parse each XML file, collect search documents from its node tree,
    and print the result as a JavaScript variable declaration.

    Keyword arguments:
    pathnames -- a list of path names to process in sequential order

    """
    url_root = 'https://devmanual.gentoo.org/'
    documents = []

    for path in pathnames:
        root = ET.parse(path).getroot()

        # Each file maps to the directory it lives in on the site.
        url = url_root + os.path.dirname(path) + '/'

        # NOTE: the previous `try: ... except: raise` wrapper was a
        # no-op (a bare except that immediately re-raises); any parse
        # or processing error still propagates to the caller.
        process_node(documents, root, None, url)

    print('var documents = ' + json.dumps(documents) + ';')
+
-print('var documents = ' + json.dumps(documents) + ';')
# Guard with equality, not `in`: `__name__ in '__main__'` is a substring
# test, so unrelated module names like 'main' or '_' would also match.
if __name__ == '__main__':
    main(sys.argv[1:])