Monday, November 08, 2010 at 12:07 AM.

system.verbs.builtins.xml.entityDecode

on entityDecode (s, flAlphaEntities = false, flSkipMalformedEntities = false, flDecodeHexEntities = false) {
	<<Purpose
		<<Decode XML/HTML character entities in the string s and return the decoded string.
		<<Params
			<<s -- the text to decode.
			<<flAlphaEntities -- if true (and flDecodeHexEntities is false), the named entities &lt; &gt; &amp; &quot; &apos; and &mdash; are replaced after the numeric pass.
			<<flSkipMalformedEntities -- if true, a numeric entity that can't be parsed is left in the text as-is and scanning continues after it.
			<<flDecodeHexEntities -- if true, entities of the form &#x2A; are decoded as hexadecimal.
		<<Notes
			<<Four specific entities above 255 (curly quotes, apostrophe, emdash) are mapped to ASCII stand-ins up front; any other numeric entity with a value of 256 or more is left in the text untouched.
			<<NOTE(review): the scan below only looks for "&#", so the inline alpha-entity case statement can only fire on text like "&#amp;". When flAlphaEntities and flDecodeHexEntities are both true the trailing alpha bundle is skipped, which means ordinary "&amp;" etc. appear not to be decoded in that mode -- confirm whether that is intended.
	<<Changes
		<<2/6/09; 12:09:05 PM by DW
			<<&#8212; is an emdash, convert it to "--".
		<<1/21/09; 8:09:18 AM by DW
			<<Entities with values greater than 255 are showing up in Twitter posts and elsewhere. Time to start dealing with them. Also convert &mdash; to "--".
		<<5/5/07; 11:35:04 AM by DW
			<<Add &apos; as an alpha entity.
		<<8/16/04; 4:07:17 PM by JES
			<<New optional parameter, flDecodeHexEntities. If true, then entities in the form, &#x2A; will be decoded to their equivalent character.
			<<Default is false, preserving existing behavior.
		<<12/12/01; 2:47:54 AM by JES
			<<New optional parameter, flSkipMalformedEntities. If true, then malformed numerical entities which either contain non-numeric characters or which lack an ending semicolon character don't cause an error. They're passed through as-is.
		<<11/29/01; 6:56:06 PM by PBS
			<<Skip entities > 255, since they can't be decoded into characters.
		<<11/14/2000; 5:44:31 PM by DW
			<<xml.entityDecode now takes an optional boolean parameter, default value false, that says whether "alpha" entities are decoded. This is necessary to avoid breakage. The four entities are: &lt; &gt; &amp; &quot;
		<<10/22/99; 3:35:56 PM by DW
			<<decode &#xxx; to the equivalent ASCII character.
	bundle { //start dealing with entities with values greater than 256 -- 1/21/09 by DW
		<<The keys must be the entity text itself, not the decoded character, or nothing in the input will ever match.
		local (t);
		new (tabletype, @t);
		t.["&#8220;"] = "\""; //left double quotation mark
		t.["&#8221;"] = "\""; //right double quotation mark
		t.["&#8217;"] = "'"; //right single quotation mark
		t.["&#8212;"] = "--"; //2/6/09 by DW
		s = string.multiplereplaceall (s, @t, false)};
	local (ix, ixremainder, numstring, ixsemi, remainder = s);
	loop { //one pass per "&#" found; remainder is always the unscanned tail of s
		ixremainder = string.patternMatch ("&#", remainder);
		if ixremainder == 0 {
			break};
		ix = ixremainder + (sizeof (s) - sizeof (remainder)); //translate the offset in remainder into an offset in s
		numstring = ""; ixsemi = sizeof (s); //if no semicolon is found, the entity is taken to run to the end of s
		for i = ix + 2 to sizeof (s) { //collect everything between "&#" and the next ";"
			if s [i] == ';' {
				ixsemi = i;
				break};
			numstring = numstring + s [i]};
		local (num, flMalformed = false);
		try {
			num = number (numstring)}
		else { //hex entity or malformed
			if flDecodeHexEntities { //JES 8/16/04: decode hex entities
				if numstring[1] == "x" {
					try {
						num = number ("0" + numstring)} //"x2A" becomes "0x2A"
					else {
						flMalformed = true}}
				else {
					if flAlphaEntities { //decode alpha entities inline with numeric entities to prevent double-decoding
						case numstring {
							"amp" {
								s = string.delete (s, ix, ixsemi - ix + 1);
								s = string.insert ("&", s, ix)};
							"lt" {
								s = string.delete (s, ix, ixsemi - ix + 1);
								s = string.insert ("<", s, ix)};
							"gt" {
								s = string.delete (s, ix, ixsemi - ix + 1);
								s = string.insert (">", s, ix)};
							"quot" {
								s = string.delete (s, ix, ixsemi - ix + 1);
								s = string.insert ("\"", s, ix)}}
						else {
							flMalformed = true}}
					else {
						flMalformed = true}}}
			else {
				flMalformed = true}};
		if flSkipMalformedEntities and flMalformed { //leave the text alone and resume scanning just past the "&"
			remainder = string.mid (s, ix + 1, infinity);
			continue};
		if num < 256 { //PBS //11/29/01: skip large entities
			s = string.delete (s, ix, ixsemi - ix + 1);
			s = string.insert (char (num), s, ix)};
		remainder = string.mid (s, ix + 1, infinity)};
	bundle { //decode alpha entities
		if flAlphaEntities and (not flDecodeHexEntities) {
			<<JES 8/19/04: If flDecodeHexEntities is true, alpha-entities are decoded inline with numeric entities. Prevents double-decoding.
			s = string.replaceall (s, "&lt;", "<");
			s = string.replaceall (s, "&gt;", ">");
			s = string.replaceall (s, "&amp;", "&");
			s = string.replaceall (s, "&quot;", "\"");
			s = string.replaceall (s, "&apos;", "'");
			s = string.replaceall (s, "&mdash;", "--")}}; //1/21/09 by DW -- NOTE(review): entity name reconstructed from the change notes; confirm against the live script
	return (s)};
bundle { //test code
	<<The input has to contain the entities themselves (&#8220; / &#8221;), otherwise this call does not exercise any decoding.
	dialog.alert (entityDecode ("Social Media &#8220;Experts&#8221; are the Cancer of Twitter (and Must Be Stopped)"))}
	<<xml.entityDecode ("Oh the &lt;buzzing&gt; of the bees &amp; the &quot;sycamore&quot; trees.", true)
	<<xml.entityDecode ("A&#8217;PB")
	<<xml.entityDecode (xml.entityEncode ("Olejovář skvrna na \"Labi\"zřejmě do <Německa> & nedoplujeříjna 1999 13:49"), true)
	<<NOTE(review): the two calls below are meant to pass a malformed numeric entity, but the entity text appears to have been decoded when this listing was exported; the original input can't be recovered from here.
	<<xml.entityDecode ("{Hello}", flSkipMalformedEntities:true)
		<<"{Hello}"
	<<xml.entityDecode ("{Hello}", flSkipMalformedEntities:false) //should be an error



This listing is for code that runs in the OPML Editor environment. I created these listings because I wanted the search engines to index them, so that when I want to look up something in my codebase I don't have to use the much slower search functionality in my object database. Dave Winer.