Monday, November 08, 2010 at 12:04 AM.
system.verbs.builtins.mainResponder.search.server.indexOnePage
on indexOnePage (url, title, pageText, adrStopWords=nil, indexName=nil) {
	<<Index one page: add every indexable word of pageText to the search index, keyed by url.
	<<
	<<Parameters:
	<<url -- address of the page. Used as the name of the per-page count inside each word's table, and for relevancy ranking.
	<<title -- title of the page. Words found in the (lowercased) title get a ranking boost.
	<<pageText -- text of the page. It is passed through searchEngine.cleanText before being split on spaces.
	<<adrStopWords -- address of the stop words table. Defaults to @searchEngine.data.stopWords when nil.
	<<indexName -- passed to mainResponder.search.utilities.getIndexPath to locate the index.
	<<
	<<Returns true.
	<<
	<<Index layout, as built below: index.[first letter].[word].[url] = frequency count plus relevancy bonuses.
	<<
	<<Changes:
	<<8/20/99; 11:41:41 AM by PBS
	<<Index all words that are not in the stop words table. Don't ignore numbers. Don't ignore words that are shorter than 3 characters.
	<<07/18/00; 10:46:01 PM by PBS
	<<Instead of using string.nthField, just keep deleting from the leading edge of the text, and index the first word. Performance optimization.
	<<Cleanup:
	<<Removed the commented-out string.nthField loop and the locals only it used (i, j, numFields, ct, flMessages, doMessage).
	<<The leading word is now deleted from pageText in one place, right after it is extracted, instead of on every exit path.
	<<The lowercased page name and the default-page test are computed once, outside the loop.
	<<The default-page test now uses the lowercased page name, like every other ranking test, so "Default.html" and "Index.html" qualify too.
	<<The loop exits explicitly when only whitespace remains, rather than calling string.delete on an empty string.
	local (lowerTitle = string.lower (title));
	local (lowerAdr = string.lower (url));
	local (indexPath = mainResponder.search.utilities.getIndexPath (indexName));
	local (adrIndex = @[indexPath].index);
	if adrStopWords == nil {
		adrStopWords = @searchEngine.data.stopWords};
	on getLastField (s) {
		<<Return the last '/'-delimited field of s.
		return (string.nthField (s, '/', string.countFields (s, '/')))};
	local (pageName = getLastField (url)); //the page name is the last field of the url
	local (lowerPageName = string.lower (pageName));
	local (lowerParent);
	bundle { //get the name of the parent folder: drop the page name, then take the last field of what is left
		lowerParent = string.popSuffix (lowerAdr, '/');
		lowerParent = getLastField (lowerParent)};
	local (flDefaultPage = (lowerPageName contains "default") or (lowerPageName contains "index")); //same answer for every word, so test once
	bundle { //clean the text before indexing
		pageText = searchEngine.cleanText (pageText)};
	while sizeOf (pageText) > 0 { //PBS 07/18/00: loop through every potential word
		pageText = string.trimWhiteSpace (pageText);
		if sizeOf (pageText) == 0 { //nothing but white space was left
			break};
		local (oneWord = string.nthField (pageText, ' ', 1));
		pageText = string.delete (pageText, 1, sizeOf (oneWord)); //consume the raw word now; the leading space is trimmed on the next pass
		if oneWord beginsWith '#' {
			continue}; //it's a directive
		oneWord = string.dropNonAlphas (oneWord);
		oneWord = string.lower (oneWord);
		oneWord = string.popTrailing (oneWord, 's'); //pop off trailing s's
		oneWord = string.trimWhiteSpace (oneWord);
		if oneWord == "" {
			continue};
		if not (searchEngine.checkStopWords (oneWord, adrStopWords)) { //check if this is on the list of words not to index
			continue}; //don't index
		local (firstLetter = string.mid (oneWord, 1, 1));
		local (adrLetter = @adrIndex^.[firstLetter]); //address of this letter's table in the index
		if not defined (adrLetter^) {
			new (tableType, adrLetter)};
		local (adrWord = @adrLetter^.[oneWord]); //address of this word in the index
		if not defined (adrWord^) {
			new (tableType, adrWord)};
		local (adrPageCount = @adrWord^.[url]); //address of the count for this page in this word
		if defined (adrPageCount^) {
			adrPageCount^ = adrPageCount^ + 1} //this is a frequency count (plus the relevancy ranking, see below)
		else {
			adrPageCount^ = 1}; //first occurrence of this word in the page
		bundle { //do relevancy ranking. Each bonus is skipped once the count has reached its size; the tests run in ascending order of bonus.
			if lowerAdr contains oneWord { //if the address of the page contains the word, add 100
				if adrPageCount^ < 100 {
					adrPageCount^ = adrPageCount^ + 100}};
			if lowerPageName contains oneWord { //if the name of the page contains the word, add 500
				if adrPageCount^ < 500 {
					adrPageCount^ = adrPageCount^ + 500}};
			if flDefaultPage and (lowerParent == oneWord) { //if the name of the parent table equals the word and this is the default page, add 1000
				if adrPageCount^ < 1000 {
					adrPageCount^ = adrPageCount^ + 1000}};
			if lowerTitle contains oneWord { //if the title of the page contains the word, add 2000
				if adrPageCount^ < 2000 {
					adrPageCount^ = adrPageCount^ + 2000}}}};
	msg ("");
	return (true)}
This listing is for code that runs in the OPML Editor environment. I created these listings because I wanted the search engines to index them, so that when I want to look up something in my codebase I don't have to use the much slower search functionality in my object database. Dave Winer.