Monday, November 08, 2010 at 12:03 AM.

system.verbs.builtins.html.neuterJavaScript

on neuterJavaScript (s, legalProtocolSchemes=nil) {
	<<Neuter JavaScript in a string; neuter <script> tags, event handlers, and javascript: (and similar) protocols.
		<<Params:
			<<s -- the HTML text to filter.
			<<legalProtocolSchemes -- optional list of allowed protocol scheme names (no colon). If nil, the default list below is used; if supplied, it replaces the default list entirely. The scheme found in the text is lower-cased before it is looked up in this list.
		<<Returns the filtered string:
			<<A tag whose name is "script" (opening or closing) has its opening < replaced with the entity &lt; so it is displayed as text rather than parsed as a tag.
			<<on-event attributes (a whitespace character followed by "on" and a letter, through the end of the attribute value) are deleted.
			<<A tag in which the 10 characters after a whitespace character contain action, background, code, codebase, data, datasrc, href, lowsrc or src, and whose value names a scheme not in legalProtocolSchemes, has its opening < replaced with &lt;.
			<<Tags that start with <!-- are not examined.
		<<Changes:
			<<2/10/02; 11:51:27 PM by JES
				<<Fixed a bug where HTML attribute values which contained the string "on" would cause incorrect neutering of the text. Fixed a bug where if there were more than one on-event attribute, only the first would be neutered.
			<<12/28/01; 4:20:30 AM by JES
				<<Fixed a bug where HTML comments which contained the string "on" would not be closed properly, resulting in incorrectly formatted HTML.
			<<2/23/01; 7:40:23 PM by JES
				<<Fixed a bug where an un-closed tag would cause a script error, array index out of range.
			<<2/20/01; 4:52:06 PM by PBS
				<<Optimization: fast return if the string does not contain a < character.
			<<2/20/01; 4:33:17 PM by JES
				<<Fixed a bug where quote characters inside event handlers could potentially allow arbitrary text to pass through this filter.
			<<2/20/01; 12:23:53 PM by JES
				<<Instead of neutering event handlers by inserting an 'x' character, remove event handlers completely.
				<<Neuter illegal protocol schemes in all applicable attributes, not just href's.
				<<Instead of neutering attributes when the protocol scheme is illegal, neuter the entire tag by replacing the opening < with &lt;.
			<<2/19/01; 5:26:22 PM by PBS
				<<Removed flNeuterEventHandlers and flNeuterScriptTags parameters.
				<<Made legalProtocolSchemes an optional parameter. If not supplied, use the default list. The caller can either not supply it or over-ride it entirely.
			<<2/19/01; 2:29:55 PM by JES
				<<Neuter all illegal protocol schemes from <a href> tags.
			<<2/17/01; 1:24:12 AM by JES
				<<A whitespace character may be a space, a carriage return, or a linefeed character -- not just a space. Neuter <a href's which are javascript: protocol links, i.e. <a href="javascript:...>...</a>
			<<2/16/01; 9:06:43 PM by PBS
				<<Created.
	if not (s contains "<") { //fast return if no < character
		return (s)};
	
	local (ix = 1); //index of the character currently being examined as a possible tag start
	local (whitespaceChars = {' ', '\t', '\r', '\n'}, quoteChars = {'\'', '\"'});
	
	if legalProtocolSchemes == nil { //if nil, use default list
		legalProtocolSchemes = {"http", "https", "ftp", "mailto", "file", "gopher", "news", "nntp", "telnet", "wais", "prospero", "nfs", "afs", "at", "binhex", "binhex4", "finger", "help", "about", "mac-binary", "mac-binhex", "mac-binhex40", "macbinary", "netphone", "nsl_neighborhood", "octet-stream", "pnm", "rtsp", "uue", "whois", "x-binary", "x-binhex40", "x-compress", "x-mac-binhex40", "x-macbinary", "x-netphone", "x-nsl_neighborhood", "x-sit", "x-stuffix", "x-tar", "zip"}};
	
	while ix <= sizeof (s) {
		if s[ix] == '<' and (string.mid (s, ix, 4) != "<!--") { //a tag start that is not the opening of an HTML comment
			local (i, ixstarttagname = ix, ct);
			local (flinquotes = false, flinsinglequotes = false, flindoublequotes = false);
			for i = ix to sizeof (s) { //scan the tag one character at a time
				if ((s [i] == '>') and (not flinquotes)) or (i == sizeof (s)) { //end of the tag, or end of the string for an un-closed tag
					bundle { //neuter script tags
						local (tagname = string.mid (s, ix + 1, i - ix - 1));
						//NOTE(review): for an un-closed tag ending at the end of the string, this length leaves off the last character, so a trailing un-closed "<script" is not recognized -- confirm whether that matters
						if tagname beginswith "/" {
							tagname = string.delete (tagname, 1, 1)};
						tagname = string.trimWhiteSpace (tagname);
						//NOTE(review): only a space separates the name from attributes here; a tab or line break directly after the tag name is not handled -- confirm
						if tagname contains ' ' { //ignore any attributes
							tagname = string.nthfield (tagname, ' ', 1)};
						if string.lower (tagname) == "script" {
							s = string.delete (s, ix, 1); //delete the <
							s = string.insert ("&lt;", s, ix)}; //replace with the entity &lt; -- re-inserting a literal < would leave the tag active
						break}};
				if s [i] == '\'' {
					flinsinglequotes = not flinsinglequotes};
				if s [i] == '\"' {
					flindoublequotes = not flindoublequotes};
				if flinsinglequotes or flindoublequotes {
					flinquotes = true}
				else {
					flinquotes = false};
				ct = sizeOf (s);
				bundle { //neuter event handlers
					if i < ct - 3 {
						if whitespaceChars contains s[i] { //02/17/01 JES: whitespace can be \t, \r or \n
							if string.lower (string.mid (s, i +1, 2)) == "on" { //an event handler?
								if string.isAlpha (string.mid (s, i + 3, 1)) {
									local (eventend = i + 1);
									while eventend <= sizeOf (s) { //find the '=' character
										if s[eventend] == '=' {
											break};
										eventend++};
									eventend++;
									while eventend <= sizeOf (s) { //find the first non-whitespace character after the '='
										if not (whitespaceChars contains s[eventend]) {
											break};
										eventend++};
									local (quotechar);
									if eventend <= sizeOf (s) {
										if (s[eventend] == '\'') or (s[eventend] == '\"') {
											quotechar = s[eventend];
											eventend++}};
									while eventend <= sizeOf (s) { //find the end of the attribute value
										if quotechar != nil {
											if (s[eventend] == quotechar) {
												if (s[eventend - 1] != '\\') { //a backslash-escaped quote does not end the value
													eventend++;
													break}}}
										else {
											if (whitespaceChars contains s[eventend]) or (s[eventend] == '>') {
												break}};
										eventend++};
									s = string.delete (s, i, eventend - i); //remove the whitespace character and the whole handler
									ix--; //offsets the ix++ at the bottom of the outer loop, so the same tag is scanned again for further handlers
									break}}}}};
				if i < ct - 5 { //02/19/01 JES: neuter illegal protocol schemes in all applicable attributes
					if whitespaceChars contains s[i] {
						local (checkString = string.lower (string.mid (s, i + 1, 10)));
						case true {
							string.patternMatch ("action", checkString) > 0;
							string.patternMatch ("background", checkString) > 0;
							string.patternMatch ("code", checkString) > 0;
							string.patternMatch ("codebase", checkString) > 0;
							string.patternMatch ("data", checkString) > 0;
							string.patternMatch ("datasrc", checkString) > 0;
							string.patternMatch ("href", checkString) > 0;
							string.patternMatch ("lowsrc", checkString) > 0;
							string.patternMatch ("src", checkString) > 0 {
								local (schemestart = i + 1);
								while schemestart <= sizeOf (s) { //find the '=' character
									if s[schemestart] == '=' {
										break};
									schemestart++};
								schemestart++;
								local (foundquote = false);
								while schemestart <= sizeOf (s) { //find the protocol scheme start
									if string.isAlpha (s[schemestart]) {
										break};
									case s[schemestart] {
										'_';
										'-' {
											break}};
									schemestart++};
								local (schemeend = schemestart, flFoundScheme = true);
								while schemeend <= sizeOf (s) {
									if s[schemeend] == ':' { //the protocol scheme ends with a colon
										break};
									case s[schemeend] { //other characters which would end the href attribute
										'/';
										'\"';
										'\'' {
											flFoundScheme = false;
											break}};
									schemeend++};
								if flFoundScheme {
									local (scheme = string.lower (string.mid (s, schemestart, schemeend - schemestart)));
									if not (legalProtocolSchemes contains scheme) {
										s = string.delete (s, ix, 1); //delete the <
										s = string.insert ("&lt;", s, ix); //replace with the entity &lt; so the whole tag is shown as text
										break}}}}}}}};
		ix++};
	
	return (s)}
<<bundle //test code
	<<html.neuterJavaScript ("<script type=\"text\">Foo</script>")
		<<"&lt;script type=\"text\">Foo&lt;/script>"
	<<html.neuterJavaScript ("\"<script type=\"text\">Foo</script>")
		<<"\"&lt;script type=\"text\">Foo&lt;/script>"
	<<html.neuterJavaScript ("< script    type=\"text\">Foo</      script>")
		<<"&lt; script    type=\"text\">Foo&lt;/      script>"
	<<html.neuterJavaScript ("Foobar \r\n< script    type=\"text\">Foo</      script>")
		<<"Foobar \r\n&lt; script    type=\"text\">Foo&lt;/      script>"
	<<html.neuterJavaScript ("<hr> <a href=\"http://foo.com/\" onmouseover='bar'>Foobar</a><p>\r\n< script    type=\"text\">Foo</      script>")
		<<"<hr> <a href=\"http://foo.com/\">Foobar</a><p>\r\n&lt; script    type=\"text\">Foo&lt;/      script>"
	<<html.neuterJavaScript ("<a onmouseout='foo'\">Bar</a>")
		<<"<a\">Bar</a>"
	<<html.neuterJavaScript ("<a onmouseout='foo>' onmouseover='bar'>Bar</a>")
		<<"<a>Bar</a>"
	<<html.neuterJavaScript ("<a onmouseout='foo'> onmouseover='bar' Bar</a>")
		<<"<a> onmouseover='bar' Bar</a>"
	<<html.neuterJavaScript ("<a href='onmouseover.html'> onmouseover='bar' Bar</a>")
		<<"<a href='onmouseover.html'> onmouseover='bar' Bar</a>"
	<<html.neuterJavaScript ("<a href=\"javascript:foo\">Bar</a>")
		<<"&lt;a href=\"javascript:foo\">Bar</a>"
	<<html.neuterJavaScript ("<img src=\"javascript:foo\">")
		<<"&lt;img src=\"javascript:foo\">"
	<<html.neuterJavaScript ("<a href=\"perlscript:foo\">Bar</a>")
		<<"&lt;a href=\"perlscript:foo\">Bar</a>"
	<<html.neuterJavaScript ("<a href=\" _some-other_script : foo \">Bar</a>")
		<<"&lt;a href=\" _some-other_script : foo \">Bar</a>"
	<<html.neuterJavaScript ("<a href=\" _some-other_script : foo \" style='bar:baz'>Bar</a>")
		<<"&lt;a href=\" _some-other_script : foo \" style='bar:baz'>Bar</a>"
	<<html.neuterJavaScript ("<a href=\"http://foo\">Bar</a>")
		<<"<a href=\"http://foo\">Bar</a>"
	<<html.neuterJavaScript ("<a href=\"/foo\">Bar</a>")
		<<"<a href=\"/foo\">Bar</a>"
	<<html.neuterJavaScript ("<a href=\"../foo\">Bar</a>")
		<<"<a href=\"../foo\">Bar</a>"
	<<html.neuterJavaScript ("<a href=\"foo\">Bar</a>")
		<<"<a href=\"foo\">Bar</a>"
	<<html.neuterJavaScript ("<a href='foo' style=baz:bam>Bar</a>")
		<<"<a href='foo' style=baz:bam>Bar</a>"
	<<html.neuterJavaScript ("<a href=\"foo\'\" onmouseover=\'foo\'>Bar</a>")
		<<"<a href=\"foo'\">Bar</a>"
	<<html.neuterJavaScript ("<a href=\"http://foo\'\" onmouseover=\'foo\'>Bar</a>", {"ftp", "mailto"})
		<<"&lt;a href=\"http://foo'\" onmouseover='foo'>Bar</a>"
	<<html.neuterJavaScript ("<hr onmouseover='window.alert (\"Message\")' foo>")
		<<"<hr foo>"
	<<html.neuterJavaScript ("<table><tr><td background=\"perlscript:deleteEverything\">Look out!!!</td></tr></table>")
		<<"<table><tr>&lt;td background=\"perlscript:deleteEverything\">Look out!!!</td></tr></table>"
	<<html.neuterJavaScript ("<hr onmouseover='window.alert (\"Mess\\'a>ge\")' foo>")
		<<"<hr foo>"
	<<html.neuterJavaScript ("<hr onmouseover=\"foo\">")
		<<"<hr>"
	<<html.neuterJavaScript ("<hr onmouseover='window.alert (\"Message\")'>") 
		<<"<hr>"
	<<html.neuterJavaScript ("foo bar baz")
		<<"foo bar baz"
	<<html.neuterJavaScript ("<!-- Get your groove on, Baby! -->")
		<<"<!-- Get your groove on, Baby! -->"
	<<html.neuterJavaScript("<a title='foo on bar' onmouseover='foo on bar' onmouseout='eatme()'>baz</a>")
		<<"<a title='foo on bar'>baz</a>"



This listing is for code that runs in the OPML Editor environment. I created these listings because I wanted the search engines to index them, so that when I want to look up something in my codebase I don't have to use the much slower search functionality in my object database. Dave Winer.