Monday, November 08, 2010 at 12:05 AM.
system.verbs.builtins.searchEngine.indexViaHTTP
on indexViaHTTP (siteName, url, username="", password="",cookiesOn=false, timeOutTicks=60*30, adrStopWords=nil) {
	<<Fetch a page via HTTP and index it.
	<<Return the entire, unprocessed response from tcp.httpClient if successful --
	<<because a crawler might be calling this script,
	<<and a crawler would need to extract URLs from the page text.
	<<
	<<Parameters:
		<<siteName -- used to look up the index and previews addresses for the site.
		<<url -- the page to fetch; it is also used as the page's identifier in the preview and the index.
		<<username, password, cookiesOn, timeOutTicks -- passed through unchanged to tcp.httpClient.
		<<adrStopWords -- address of the stop-words data; defaults to @searchEngine.data.stopWords when nil.
	<<
	<<Leave it up to the caller to save the GDB (call searchEngine.saveIndex).
	<<
	<<Changes
		<<The port is now found by splitting the server name on the colon instead of testing a fixed character offset, so ports of any length (":8080", ":443") are recognized, and a very short server name can no longer produce an out-of-range index.
		<<The port is now removed from the server name before calling tcp.httpClient, since the port is passed separately. NOTE(review): assumes tcp.httpClient expects a bare host name -- confirm.
		<<pageHeader no longer includes the first carriage return of the blank-line separator.
	local (server, port = 80, path, s);
	local (adrIndex, adrPreviews);
	local (pageText, pageHeader, pageTitle);
	adrIndex = searchEngine.getIndexAddress (siteName);
	adrPreviews = searchEngine.getPreviewsAddress (siteName);
	if adrStopWords == nil {
		adrStopWords = @searchEngine.data.stopWords};
	bundle { //get server, port, and path from the URL
		local (urlParts = string.urlSplit (url));
		path = urlParts [3];
		if not (path beginsWith "/") {
			path = "/" + path};
		server = urlParts [2];
		if string.countFields (server, ':') == 2 { //is there a port?
			local (portString = string.nthField (server, ':', 2));
			if sizeOf (portString) > 0 { //a trailing colon with nothing after it keeps the default port
				port = number (portString)};
			server = string.nthField (server, ':', 1)}};
	bundle { //fetch the text of the page via HTTP
		s = tcp.httpClient (server:server, port:port, path:path, username:username, password:password, cookiesOn:cookiesOn, timeOutTicks:timeOutTicks);
		<<Separate the page text from the page header.
		<<The header ends at the first blank line (CR LF CR LF).
		local (ix = string.patternMatch ("\r\n\r\n", s));
		if ix < 1 { //no separator found: treat the whole response as header
			pageHeader = s;
			pageText = ""}
		else {
			pageHeader = string.mid (s, 1, ix - 1); //everything before the separator
			pageText = string.mid (s, ix + 4, infinity)}}; //everything after the four separator characters
	bundle { //get the title and body text of the page
		pageTitle = html.getOneTagValue (pageText, "title");
		local (bodyText);
		bodyText = html.getOneTagValue (pageText, "body");
		if bodyText != "" { //index only the body when there is one; otherwise index the whole page text
			pageText = bodyText}};
	msg ("Search Engine: Indexing: " + url);
	<<Create a preview for this page.
	<<This should always be done before adding a page to the index.
	searchEngine.createPreview (pageText, pageTitle, url, url, adrPreviews);
	<<Add the page to the index.
	<<It's okay to call this verb if the page has been indexed before.
	searchEngine.indexPage (url, url, pageTitle, pageText, adrIndex, adrStopWords);
	msg (""); //clear the status message
	return (s)}
This listing is for code that runs in the OPML Editor environment. I created these listings because I wanted the search engines to index them, so that when I want to look something up in my codebase I don't have to use the much slower search functionality in my object database. Dave Winer.