Rewrite the search functionality and extend the coverage

The current script only indexes the first <p> in a text.xml, and sometimes only partially if the text is interrupted by another tag such as <c/>.

Modify build_search_documents.py such that:
- It recursively traverses from chapter all the way down to subsubsection
- Each <p>, <important>, <note>, <warning> is indexed separately
- In the search results, the match entry will have the title in the form "Chapter[ -> Section[ -> Subsection[ -> Subsubsection]]]"

Modify search.js such that:
- The ref returned for a match is its index into the "documents" array, which makes it possible to retrieve the document in O(1).

Signed-off-by: Göktürk Yüksek <gokturk@gentoo.org>
author: Göktürk Yüksek <gokturk@gentoo.org> 2019-12-09 21:08:12 -0500
committer: Göktürk Yüksek <gokturk@gentoo.org> 2019-12-19 15:58:02 -0500
commit: 926e0d0855afa40f5dcbc16b1b7c66187afd7d73 (patch)
tree: 5164b61fbe2a769c1d5fbef09dea9f07a25a4b23 /bin
parent: rename search_index.py to build_search_documents.py, and move it to bin/ (diff)
download: devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.tar.gz
devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.tar.bz2
devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.zip
1 files changed, 98 insertions, 14 deletions
diff --git a/bin/build_search_documents.py b/bin/build_search_documents.py
index 9af2753..3816fdb 100755
--- a/bin/build_search_documents.py
+++ b/bin/build_search_documents.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # Copyright 2019 Gentoo Authors
 # Distributed under the terms of the GNU GPL version 2 or later
 import json
@@ -6,19 +6,103 @@ import os.path
 import sys
 import xml.etree.ElementTree as ET
 
-files = sys.argv[1:]
-documents = []
-url_root = 'https://devmanual.gentoo.org/'
 
-for f in files:
-    tree = ET.parse(f)
-    root = tree.getroot()
-    for chapter in root.findall('chapter'):
+def stringify_node(parent: ET.Element) -> str:
+    """Flatten this node and its immediate children to a string.
+
+    Combine the text and tail of this node, and any of its immediate
+    children, if there are any, into a flat string. The tag <d/> is a
+    special case that resolves to the dash ('-') character. Runs of
+    whitespace, including newlines, collapse to a single space.
+
+    Keyword arguments:
+    parent -- the node to convert to a string
+
+    """
+    # A missing text or tail is None, never '', so guard each of them
+    text = parent.text or ''
+
+    # Element.getchildren() is gone as of Python 3.9; iterate directly
+    for child in parent:
+        # The '<d/>' tag is simply a fancier '-' character
+        if child.tag == 'd':
+            text += '-'
+        # Keep inner whitespace so words around inline tags stay apart
+        text += child.text or ''
+        text += child.tail or ''
+
+    text += parent.tail or ''
+    # split() also drops the leading and trailing whitespace
+    return ' '.join(text.split())
+
+
+def process_node(documents: list, node: ET.Element, name: str, url: str) -> None:
+    """Walk a node and its descendants, collecting searchable text.
+
+    What happens depends on the tag of the node:
+    - <chapter> restarts the title from its own <title> and descends.
+    - <section>, <subsection> and <subsubsection> append their <title>
+      to the search result title, repoint the url to the anchor made
+      from that title, and descend into their children.
+    - <body> and <guide> only descend into their children.
+    - <p>, <important>, <note>, <warning> become a search document.
+    - Any other tag is skipped, together with everything below it.
+
+    Keyword arguments:
+    documents -- the search documents array, appended to in place
+    node -- the node to process
+    name -- the title to display for the search term match
+    url -- the url for the search term match in the document
+
+    """
+    tag = node.tag
+
+    if tag in ('p', 'important', 'note', 'warning'):
+        # Terminal node; its position in the array doubles as its id
+        entry = {'id': len(documents),
+                 'name': name,
+                 'text': stringify_node(node),
+                 'url': url}
+        documents.append(entry)
+        return
+
+    if tag == 'chapter':
+        name = stringify_node(node.find('title'))
+    elif tag in ('section', 'subsection', 'subsubsection'):
+        heading = stringify_node(node.find('title'))
+        name = name + ' -> ' + heading
+        # Drop any previous fragment before adding the new anchor
+        page, anchor = url.split('#')[0], heading.lower().replace(' ', '-')
+        url = "{url_base}#{anchor}".format(url_base=page, anchor=anchor)
+    elif tag not in ('body', 'guide'):
+        return
+
+    for child in node:
+        process_node(documents, child, name, url)
+
+
+def main(pathnames: list) -> None:
+    """Index the given XML files and print the result as JavaScript.
+
+    Keyword arguments:
+    pathnames -- a list of path names to process in sequential order
+    """
+    url_root = 'https://devmanual.gentoo.org/'
+    documents = []
+
+    for path in pathnames:
+        tree = ET.parse(path)
+        root = tree.getroot()
+
         try:
-            documents.append({"name": chapter.find('title').text,
-                "text": chapter.find('body').find('p').text,
-                 "url": url_root + os.path.dirname(f) + '/'})
-        except AttributeError:
-            pass
+            url = url_root + os.path.dirname(path) + '/'  # from the file's directory
+
+            process_node(documents, root, None, url)  # no title known yet
+        except:  # NOTE(review): only re-raises, so the try has no effect
+            raise
+
+    print('var documents = ' + json.dumps(documents) + ';')  # written to stdout
+
 
-print('var documents = ' + json.dumps(documents) + ';')
+if __name__ == '__main__':  # equality; 'in' was a substring test
+    main(sys.argv[1:])
author	Göktürk Yüksek <gokturk@gentoo.org>	2019-12-09 21:08:12 -0500
committer	Göktürk Yüksek <gokturk@gentoo.org>	2019-12-19 15:58:02 -0500
commit	926e0d0855afa40f5dcbc16b1b7c66187afd7d73 (patch)
tree	5164b61fbe2a769c1d5fbef09dea9f07a25a4b23 /bin
parent	rename search_index.py to build_search_documents.py, and move it to bin/ (diff)
download	devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.tar.gz devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.tar.bz2 devmanual-926e0d0855afa40f5dcbc16b1b7c66187afd7d73.zip