Monday, November 08, 2010 at 12:04 AM.
system.verbs.builtins.mainResponder.search.server.indexOnePage
on indexOnePage (url, title, pageText, adrStopWords=nil, indexName=nil) {
	<<Index one page: every word of pageText that passes the stop words check is counted in the search index under this page's url.
	<<Parameters:
		<<url -- address of the page. It is the key of the per-word count and is also used for relevancy ranking.
		<<title -- title of the page. Words contained in it get the largest ranking bonus.
		<<pageText -- text of the page. It is run through searchEngine.cleanText and then consumed word by word.
		<<adrStopWords -- address of the stop words table. Defaults to @searchEngine.data.stopWords.
		<<indexName -- handed to mainResponder.search.utilities.getIndexPath to locate the index.
	<<Returns true.
	<<Index layout, as written by this script: index.[first letter].[word].[url] = count for this page (frequency plus ranking bonuses).
	<<Changes:
		<<8/20/99; 11:41:41 AM by PBS
			<<Index all words that are not in the stop words table. Don't ignore numbers. Don't ignore words that are shorter than 3 characters.
		<<07/18/00; 10:46:01 PM by PBS
			<<Instead of using string.nthField, just keep deleting from the leading edge of the text, and index the first word. Performance optimization.
		<<Cleanup
			<<Removed the unused locals (i, j, numFields, ct), the never-called doMessage helper and the commented-out former loop.
			<<The word is now removed from pageText in one place, right after it is extracted, instead of on every exit path of the loop.
			<<The lower-cased page name is computed once, and the default-page check uses it like the other ranking checks do.
	
	local (lowerTitle = string.lower (title));
	local (lowerAdr = string.lower (url));
	local (lowerPageName, lowerParent);
	local (indexPath = mainResponder.search.utilities.getIndexPath (indexName));
	local (adrIndex = @[indexPath].index);
	
	if adrStopWords == nil {
		adrStopWords = @searchEngine.data.stopWords};
	
	on getLastField (s) {
		<<Return the last '/'-delimited field of s.
		local (ctfields = string.countFields (s, '/'));
		return (string.nthField (s, '/', ctfields))};
	
	bundle { //get the page name, lower-cased once for the ranking checks in the loop
		lowerPageName = string.lower (getLastField (url))};
	
	bundle { //get the name of the parent folder
		lowerParent = string.popSuffix (lowerAdr, '/');
		lowerParent = getLastField (lowerParent)};
	
	bundle { //clean the text before indexing
		pageText = searchEngine.cleanText (pageText)};
	
	while sizeOf (pageText) > 0 { //PBS 07/18/00: loop through every potential word
		pageText = string.trimWhiteSpace (pageText);
		if sizeOf (pageText) == 0 { //nothing but white space was left
			break};
		
		local (oneWord = string.nthField (pageText, ' ', 1));
		pageText = string.delete (pageText, 1, sizeOf (oneWord)); //consume the word now, so every continue below still makes progress
		
		if oneWord beginsWith '#' { //it's a directive
			continue};
		
		oneWord = string.dropNonAlphas (oneWord);
		oneWord = string.lower (oneWord);
		oneWord = string.popTrailing (oneWord, 's'); //pop off trailing s's
		oneWord = string.trimWhiteSpace (oneWord);
		if oneWord == "" {
			continue};
		
		if not (searchEngine.checkStopWords (oneWord, adrStopWords)) { //check if this is on the list of words not to index
			continue}; //don't index
		
		<<PBS 8/20/99: a word can start with a number, so the first character is used as the table name without a letter check.
		local (firstLetter = string.mid (oneWord, 1, 1));
		local (adrLetter = @adrIndex^.[firstLetter]); //address of this letter's table in the index
		if not defined (adrLetter^) {
			new (tableType, adrLetter)};
		
		local (adrWord = @adrLetter^.[oneWord]); //address of this word in the index
		if not defined (adrWord^) {
			new (tableType, adrWord)};
		
		local (adrPageCount = @adrWord^.[url]); //address of the count for this page in this word
		if defined (adrPageCount^) {
			adrPageCount^ = adrPageCount^ + 1} //this is a frequency count (plus the relevancy ranking, see below)
		else {
			adrPageCount^ = 1}; //first occurrence of this word in the page
		
		bundle { //do relevancy ranking; each bonus is only added while the count is still below that bonus
			if lowerAdr contains oneWord { //if the address of the page contains the word, add 100
				if adrPageCount^ < 100 {
					adrPageCount^ = adrPageCount^ + 100}};
			if lowerPageName contains oneWord { //if the name of the page contains the word, add 500
				if adrPageCount^ < 500 {
					adrPageCount^ = adrPageCount^ + 500}};
			if lowerParent == oneWord { //if the name of the parent table equals the word and this is the default page, add 1000
				if lowerPageName contains "default" or lowerPageName contains "index" { //lower-cased name, consistent with the checks above
					if adrPageCount^ < 1000 {
						adrPageCount^ = adrPageCount^ + 1000}}};
			if lowerTitle contains oneWord { //if the title of the page contains the word, add 2000
				if adrPageCount^ < 2000 {
					adrPageCount^ = adrPageCount^ + 2000}}}};
	
	msg ("");
	return (true)}
This listing is for code that runs in the OPML Editor environment. I created these listings because I wanted the search engines to index them, so that when I want to look up something in my codebase I don't have to use the much slower search functionality in my object database. Dave Winer.