Index - microcms plugin

Index (microcms-plugin)

The index plugin is used to index the words used in each page in the site. It implements microcms consume and process functions.

When the consume function is invoked, the file contents are parsed, stripping out HTML tags to leave the page text. After short and frequently occurring words are removed, the remaining words are placed into an index maintained by the plugin. As each page is consumed, an index of the entire site is accumulated, mapping words to the frequency with which they appear in each page.

The process function materialises the site-wide index as a set of files which a client-side search utility (see this site's search page for example) can easily retrieve and process in order to search for pages including one or more specific terms. The process function is invoked when the following tag is encountered in a page.

<microcms function="index" path="..." />

When the process function is invoked, it returns an empty string (no new content is added to the calling page). However, the contents of the index that was created when microcms scanned the site's files are converted to a set of JSON-encoded files (using the json module to encode Python objects to JSON) representing an alphabetically ordered search tree. These files are saved to the data sub-directory of the directory containing the invoking page.

Code for the index plugin is shown below:

index.py
# index.py - a microcms plugin for indexing all the text in HTML files, and outputting the index to a JSON data file
# Copyright (c) 2009 Niall McCarroll  
# Distributed under the MIT/X11 License (http://www.mccarroll.net/snippets/license.txt)

import sys
import os
import os.path

import json

def consume(cfile,contents):
    """microcms consume hook: index the words of one page.

    Uses the page's 'microcms-title' metadata value as its display title
    when present, falling back to the page's output path.
    """
    title = cfile.outpath
    # idiom fix: test None with `is not None` (was `!= None`)
    if cfile.metadata is not None and 'microcms-title' in cfile.metadata:
        title = cfile.metadata['microcms-title']
    indexer.index(cfile.outpath,title,contents)

def process(cfile,attrs):
    """microcms process hook: dump the accumulated index as JSON files.

    The files are written to a "data" sub-directory alongside the page
    that invoked the tag.  Returns an empty string, so no new content is
    inserted into the calling page.
    """
    parent = os.path.split(cfile.path)[0]
    indexer.output_dictionary(os.path.join(parent,"data"))
    return ""
    
class Indexer:
    """Accumulates a word -> per-page frequency index for a site and
    writes it out as a tree of JSON files that a client-side search
    utility can traverse."""

    def __init__(self):
        self.pdict = {}        # word -> dict_entry holding per-page counts
        self.pages = {}        # page path -> numeric page id
        self.titles = {}       # page id -> page title
        self.revpages = {}     # page id -> page path
        self.pagecounter = 1   # next page id to assign
        self.ignoretokens = frozenset(['and','then','the'])  # stop words
        self.leafsize = 100    # max number of words stored in one leaf file

    def indexdir(self,path):
        """Recursively index every .chtml file found under path."""
        for f in os.listdir(path):
            fpath = os.path.join(path,f)
            if os.path.isdir(fpath):
                self.indexdir(fpath)
            elif os.path.isfile(fpath) and fpath.endswith(".chtml"):
                # Bug fix: index() takes (page, title, txt); the title
                # argument was missing here.  Use the path as the title.
                # Also close the file handle (was leaked).
                with open(fpath) as fh:
                    self.index(fpath, fpath, fh.read())

    def index(self,page,title,txt):
        """Assign page a numeric id (first time only) and index the
        words appearing in txt against that id."""
        if page not in self.pages:
            self.pages[page] = self.pagecounter
            self.titles[self.pagecounter] = title
            self.revpages[self.pagecounter] = page
            self.pagecounter += 1
        tokens = self.tokenise(self.filtertxt(self.parsehtml(txt)))
        self.extend_dictionary(tokens, self.pages[page])

    def parsehtml(self,txt):
        """Strip HTML tags from txt, returning the remaining text with
        each run of text followed by a single space."""
        result = ""
        while True:
            tagstart = txt.find("<")
            # Bug fix: when no '<' remains, find() returns -1 and the old
            # slice txt[0:-1] silently dropped the final character.
            txttoken = (txt if tagstart < 0 else txt[:tagstart]).strip()
            if txttoken != "":
                result += txttoken + " "
            if tagstart > -1:
                # Bug fix: search for '>' from the tag start; a bare '>'
                # in the page text used to truncate too early and caused
                # the text before it to be indexed twice.
                tagstop = txt.find(">", tagstart)
                if tagstop > -1:
                    txt = txt[tagstop+1:]
                    continue
            break
        return result

    def tokenise(self,txt):
        """Split filtered text into word tokens."""
        # split() with no argument collapses runs of whitespace, so no
        # empty-string tokens are produced
        return txt.split()

    def filtertxt(self,txt):
        """Lower-case txt and replace every character outside a-z with a
        space, leaving only plain word characters."""
        return ''.join(ch if 'a' <= ch <= 'z' else ' ' for ch in txt.lower())

    def usetoken(self,tok):
        """Return True if tok should be indexed: longer than two
        characters and not a stop word."""
        return len(tok) > 2 and tok not in self.ignoretokens

    class dict_entry:
        """Per-word record mapping page id -> occurrence count."""

        def __init__(self):
            self.pagecounts = {}

        def inc(self,page):
            # increment the occurrence count for page, starting at 0
            self.pagecounts[page] = self.pagecounts.get(page, 0) + 1

        def __str__(self):
            result = ''
            for page in self.pagecounts:
                result += "\t"+str(page)+" : "+str(self.pagecounts[page])+"\n"
            return result

        def __repr__(self):
            return str(self.pagecounts)

    def extend_dictionary(self,tokens,page):
        """Add each usable token to the index, counted against page."""
        for tok in tokens:
            if not self.usetoken(tok):
                continue
            if tok not in self.pdict:
                self.pdict[tok] = Indexer.dict_entry()
            self.pdict[tok].inc(page)

    class Node:
        """Interior search-tree node: maps key ranges to child files."""

        def __init__(self,kdict):
            self.kdict = kdict

        def output(self,directory,depth,count):
            """Write the children and then this node as JSON files.
            Returns (next file counter, this node's file name)."""
            paths = {}
            for k in self.kdict:
                (count,paths[k]) = self.kdict[k].output(directory,depth+1,count)
                count += 1
            if depth == 0:
                fname = 'index.json'   # the root has a well-known name
            else:
                fname = "index"+str(depth)+"_"+str(count)+".json"
            # Bug fix: close the file handle (was leaked via "w+")
            with open(os.path.join(directory,fname),"w") as f:
                f.write(json.dumps(paths))
            return (count+1,fname)

    class Leaf:
        """Search-tree leaf: maps one key range to word -> page counts."""

        def __init__(self,kdict):
            self.kdict = kdict

        def output(self,directory,depth,count):
            """Write this leaf as a JSON file.
            Returns (next file counter, this leaf's file name)."""
            if depth == 0:
                # Bug fix: a small index (<= leafsize words) produces a
                # single root leaf; give it the well-known root name,
                # matching what Node does at depth 0.
                fname = 'index.json'
            else:
                fname = "index"+str(depth)+"_"+str(count)+".json"
            # Bug fix: close the file handle (was leaked via "w+")
            with open(os.path.join(directory,fname),"w") as f:
                f.write(json.dumps(self.kdict))
            return (count+1,fname)

    def partition(self,keys):
        """Recursively split the sorted key list into a binary tree of
        Nodes whose Leaves hold at most leafsize words each."""
        result = {}
        if len(keys) > self.leafsize:
            mid = len(keys)//2   # Bug fix: integer division (Python 3)
            k1 = keys[0]+"."+keys[mid]
            k2 = keys[mid+1]+"."+keys[-1]
            result[k1] = self.partition(keys[:mid+1])
            result[k2] = self.partition(keys[mid+1:])
            return Indexer.Node(result)
        else:
            k1 = keys[0]+"."+keys[-1]
            leafdata = {}
            for k in keys:
                leafdata[k] = self.pdict[k].pagecounts
            result[k1] = leafdata
            return Indexer.Leaf(result)

    def output_dictionary(self,directory="."):
        """Write pages.json, titles.json and the index tree files into
        directory (created if it does not exist)."""
        # robustness: the target (e.g. a new "data" sub-dir) may not exist
        os.makedirs(directory, exist_ok=True)
        with open(os.path.join(directory,"pages.json"),"w") as pf:
            pf.write(json.dumps(self.revpages))
        with open(os.path.join(directory,"titles.json"),"w") as pf:
            pf.write(json.dumps(self.titles))
        # Bug fix: dict.keys() has no sort() in Python 3; use sorted()
        keys = sorted(self.pdict)
        if keys:   # robustness: an empty index has no tree to write
            tree = self.partition(keys)
            tree.output(directory,0,0)
        
# Module-level singleton shared by the consume() and process() hooks
indexer = Indexer()

if __name__ == '__main__':
    # Standalone usage: index the directory named on the command line and
    # write the resulting JSON files into a local "index" directory
    path = sys.argv[1]
    indexer.indexdir(path)
    indexer.output_dictionary("index")
	
	

 

Leave a comment

Anti-Spam Check
Comment