Monday, November 08, 2010 at 12:05 AM.

system.verbs.builtins.searchEngine.indexViaHTTP

on indexViaHTTP (siteName, url, username="", password="",cookiesOn=false, timeOutTicks=60*30, adrStopWords=nil) {
	
	<<Fetch a page via HTTP and index it.
		<<Return the entire, unprocessed text of the page if successful --
			<<because a crawler might be calling this script,
			<<and a crawler would need to extract URLs from the page text.
			<<
			<<Leave it up to the caller to save the GDB (call searchEngine.saveIndex).
	<<Parameters:
		<<siteName -- passed to searchEngine.getIndexAddress and searchEngine.getPreviewsAddress to locate the index and previews for the site.
		<<url -- the page to fetch; it is also used as the key when creating the preview and indexing the page.
		<<username, password, cookiesOn, timeOutTicks -- passed straight through to tcp.httpClient.
		<<adrStopWords -- address of the stop words; defaults to @searchEngine.data.stopWords when nil.
	<<Changes:
		<<The port is now taken from whatever follows the colon in the server part of the URL.
			<<Previously only a colon exactly three characters from the end was recognized, so only two-digit ports (":80") worked; a URL such as http://example.com:8080/ was fetched on port 80.
			<<The old check also indexed the server string at (size - 2), which is out of range when the server string is shorter than three characters.
		<<The pageHeader local was removed; it was assigned but never read.
	
	local (server, port = 80, path, s);
	local (adrIndex, adrPreviews);
	local (pageText, pageTitle);
	
	adrIndex = searchEngine.getIndexAddress (siteName);
	adrPreviews = searchEngine.getPreviewsAddress (siteName);
	
	if adrStopWords == nil {
		adrStopWords = @searchEngine.data.stopWords};
	
	bundle { //get server, port, and path from the URL
		local (urlParts = string.urlSplit (url));
		
		path = urlParts [3];
		if not (path beginsWith "/") {
			path = "/" + path};
		
		server = urlParts [2];
		if server contains ":" { //is there a port?
			<<The port is the field after the colon, whatever its length.
			local (portString = string.nthField (server, ':', 2));
			if portString != "" { //a trailing colon with nothing after it keeps the default port
				port = number (portString)}}};
		<<NOTE(review): server is still passed to tcp.httpClient with the ":port" suffix attached, exactly as before.
			<<Confirm whether tcp.httpClient expects the suffix to be stripped.
	
	bundle { //fetch the text of the page via HTTP
		s = tcp.httpClient (server:server, port:port, path:path, username:username, password:password, cookiesOn:cookiesOn, timeOutTicks:timeOutTicks);
		
		<<Separate the page text from the page header.
			<<The header ends at the first blank line (CR LF CR LF); everything after it is the page text.
			<<If there is no blank line, the response is treated as all header and the page text is empty.
		local (ix = string.patternMatch ("\r\n\r\n", s));
		
		pageText = "";
		if ix >= 1 {
			pageText = string.mid (s, ix + 4, infinity)}}; //skip the four separator characters
	
	bundle { //get the title and body text of the page
		pageTitle = html.getOneTagValue (pageText, "title");
		
		<<Index only the body when there is one; otherwise fall back to the whole page text.
		local (bodyText);
		bodyText = html.getOneTagValue (pageText, "body");
		if bodyText != "" {
			pageText = bodyText}};
	
	msg ("Search Engine: Indexing: " + url);
	
	<<Create a preview for this page.
		<<This should always be done before adding a page to the index.
	searchEngine.createPreview (pageText, pageTitle, url, url, adrPreviews);
	
	<<Add the page to the index.
		<<It's okay to call this verb if the page has been indexed before.
	searchEngine.indexPage (url, url, pageTitle, pageText, adrIndex, adrStopWords);
	
	msg (""); //clear the status message
	
	return (s)} //the raw HTTP response, header included, so a crawler can extract URLs from it



This listing is for code that runs in the OPML Editor environment. I created these listings because I wanted the search engines to index them, so that when I want to look up something in my codebase I don't have to use the much slower search functionality in my object database. Dave Winer.