Rewrote loadNonStandardEntities() to represent each character with a more reliable token: the "\uxxxx" Unicode escape notation.

This commit is contained in:
jim teeuwen 2009-12-02 21:44:02 +01:00
parent 664e4ae0d2
commit 379dfddd5b
1 changed file with 252 additions and 254 deletions

View File

@@ -33,7 +33,6 @@ func UTF8ToHtml(token string) string {
return fmt.Sprintf("&#%d;", rune); return fmt.Sprintf("&#%d;", rune);
} }
/* /*
http://www.w3.org/TR/html4/sgml/entities.html http://www.w3.org/TR/html4/sgml/entities.html
@@ -49,259 +48,258 @@ func UTF8ToHtml(token string) string {
error and abort the parsing. Hence the ability to supply this map. error and abort the parsing. Hence the ability to supply this map.
*/ */
func loadNonStandardEntities(em *map[string]string) { func loadNonStandardEntities(em *map[string]string) {
// Generic entities string([]uint8{160}); (*em)["pi"] = "\u03c0";
(*em)["nbsp"] = " "; (*em)["nabla"] = "\u2207";
(*em)["iexcl"] = "¡"; (*em)["isin"] = "\u2208";
(*em)["cent"] = "¢"; (*em)["loz"] = "\u25ca";
(*em)["pound"] = "£"; (*em)["prop"] = "\u221d";
(*em)["curren"] = "¤"; (*em)["para"] = "\u00b6";
(*em)["yen"] = "¥"; (*em)["Aring"] = "\u00c5";
(*em)["brvbar"] = "¦"; (*em)["euro"] = "\u20ac";
(*em)["sect"] = "§"; (*em)["sup3"] = "\u00b3";
(*em)["uml"] = "¨"; (*em)["sup2"] = "\u00b2";
(*em)["copy"] = "©"; (*em)["sup1"] = "\u00b9";
(*em)["ordf"] = "ª"; (*em)["prod"] = "\u220f";
(*em)["laquo"] = "«"; (*em)["gamma"] = "\u03b3";
(*em)["not"] = "¬"; (*em)["perp"] = "\u22a5";
(*em)["shy"] = "­"; (*em)["lfloor"] = "\u230a";
(*em)["reg"] = "®"; (*em)["fnof"] = "\u0192";
(*em)["macr"] = "¯"; (*em)["frasl"] = "\u2044";
(*em)["deg"] = "°"; (*em)["rlm"] = "\u200f";
(*em)["plusmn"] = "±"; (*em)["omega"] = "\u03c9";
(*em)["sup"] = "²"; (*em)["part"] = "\u2202";
(*em)["sup"] = "³"; (*em)["euml"] = "\u00eb";
(*em)["acute"] = "´"; (*em)["Kappa"] = "\u039a";
(*em)["micro"] = "µ"; (*em)["nbsp"] = "\u00a0";
(*em)["para"] = "¶"; (*em)["Eacute"] = "\u00c9";
(*em)["middot"] = "·"; (*em)["brvbar"] = "\u00a6";
(*em)["cedil"] = "¸"; (*em)["otimes"] = "\u2297";
(*em)["sup"] = "¹"; (*em)["ndash"] = "\u2013";
(*em)["ordm"] = "º"; (*em)["thinsp"] = "\u2009";
(*em)["raquo"] = "»"; (*em)["nu"] = "\u03bd";
(*em)["frac14"] = "¼"; (*em)["Upsilon"] = "\u03a5";
(*em)["frac12"] = "½"; (*em)["upsih"] = "\u03d2";
(*em)["frac34"] = "¾"; (*em)["raquo"] = "\u00bb";
(*em)["iquest"] = "¿"; (*em)["yacute"] = "\u00fd";
(*em)["Agrave"] = "À"; (*em)["delta"] = "\u03b4";
(*em)["Aacute"] = "Á"; (*em)["eth"] = "\u00f0";
(*em)["Acirc"] = "Â"; (*em)["supe"] = "\u2287";
(*em)["Atilde"] = "Ã"; (*em)["ne"] = "\u2260";
(*em)["Auml"] = "Ä"; (*em)["ni"] = "\u220b";
(*em)["Aring"] = "Å"; (*em)["eta"] = "\u03b7";
(*em)["AElig"] = "Æ"; (*em)["uArr"] = "\u21d1";
(*em)["Ccedil"] = "Ç"; (*em)["image"] = "\u2111";
(*em)["Egrave"] = "È"; (*em)["asymp"] = "\u2248";
(*em)["Eacute"] = "É"; (*em)["oacute"] = "\u00f3";
(*em)["Ecirc"] = "Ê"; (*em)["rarr"] = "\u2192";
(*em)["Euml"] = "Ë"; (*em)["emsp"] = "\u2003";
(*em)["Igrave"] = "Ì"; (*em)["acirc"] = "\u00e2";
(*em)["Iacute"] = "Í"; (*em)["shy"] = "\u00ad";
(*em)["Icirc"] = "Î"; (*em)["yuml"] = "\u00ff";
(*em)["Iuml"] = "Ï"; (*em)["acute"] = "\u00b4";
(*em)["ETH"] = "Ð"; (*em)["int"] = "\u222b";
(*em)["Ntilde"] = "Ñ"; (*em)["ccedil"] = "\u00e7";
(*em)["Ograve"] = "Ò"; (*em)["Acirc"] = "\u00c2";
(*em)["Oacute"] = "Ó"; (*em)["Ograve"] = "\u00d2";
(*em)["Ocirc"] = "Ô"; (*em)["times"] = "\u00d7";
(*em)["Otilde"] = "Õ"; (*em)["weierp"] = "\u2118";
(*em)["Ouml"] = "Ö"; (*em)["Tau"] = "\u03a4";
(*em)["times"] = "×"; (*em)["omicron"] = "\u03bf";
(*em)["Oslash"] = "Ø"; (*em)["lt"] = "\u003c";
(*em)["Ugrave"] = "Ù"; (*em)["Mu"] = "\u039c";
(*em)["Uacute"] = "Ú"; (*em)["Ucirc"] = "\u00db";
(*em)["Ucirc"] = "Û"; (*em)["sub"] = "\u2282";
(*em)["Uuml"] = "Ü"; (*em)["le"] = "\u2264";
(*em)["Yacute"] = "Ý"; (*em)["sum"] = "\u2211";
(*em)["THORN"] = "Þ"; (*em)["sup"] = "\u2283";
(*em)["szlig"] = "ß"; (*em)["lrm"] = "\u200e";
(*em)["agrave"] = "à"; (*em)["frac34"] = "\u00be";
(*em)["aacute"] = "á"; (*em)["Iota"] = "\u0399";
(*em)["acirc"] = "â"; (*em)["Ugrave"] = "\u00d9";
(*em)["atilde"] = "ã"; (*em)["THORN"] = "\u00de";
(*em)["auml"] = "ä"; (*em)["rsaquo"] = "\u203a";
(*em)["aring"] = "å"; (*em)["not"] = "\u00ac";
(*em)["aelig"] = "æ"; (*em)["sigma"] = "\u03c3";
(*em)["ccedil"] = "ç"; (*em)["iuml"] = "\u00ef";
(*em)["egrave"] = "è"; (*em)["epsilon"] = "\u03b5";
(*em)["eacute"] = "é"; (*em)["spades"] = "\u2660";
(*em)["ecirc"] = "ê"; (*em)["theta"] = "\u03b8";
(*em)["euml"] = "ë"; (*em)["divide"] = "\u00f7";
(*em)["igrave"] = "ì"; (*em)["Atilde"] = "\u00c3";
(*em)["iacute"] = "í"; (*em)["uacute"] = "\u00fa";
(*em)["icirc"] = "î"; (*em)["Rho"] = "\u03a1";
(*em)["iuml"] = "ï"; (*em)["trade"] = "\u2122";
(*em)["eth"] = "ð"; (*em)["chi"] = "\u03c7";
(*em)["ntilde"] = "ñ"; (*em)["agrave"] = "\u00e0";
(*em)["ograve"] = "ò"; (*em)["or"] = "\u2228";
(*em)["oacute"] = "ó"; (*em)["circ"] = "\u02c6";
(*em)["ocirc"] = "ô"; (*em)["middot"] = "\u00b7";
(*em)["otilde"] = "õ"; (*em)["plusmn"] = "\u00b1";
(*em)["ouml"] = "ö"; (*em)["aring"] = "\u00e5";
(*em)["divide"] = "÷"; (*em)["lsquo"] = "\u2018";
(*em)["oslash"] = "ø"; (*em)["Yacute"] = "\u00dd";
(*em)["ugrave"] = "ù"; (*em)["oline"] = "\u203e";
(*em)["uacute"] = "ú"; (*em)["copy"] = "\u00a9";
(*em)["ucirc"] = "û"; (*em)["icirc"] = "\u00ee";
(*em)["uuml"] = "ü"; (*em)["lowast"] = "\u2217";
(*em)["yacute"] = "ý"; (*em)["Oacute"] = "\u00d3";
(*em)["thorn"] = "þ"; (*em)["aacute"] = "\u00e1";
(*em)["yuml"] = "ÿ"; (*em)["oplus"] = "\u2295";
(*em)["fnof"] = "ƒ"; (*em)["crarr"] = "\u21b5";
(*em)["Alpha"] = "Α"; (*em)["thetasym"] = "\u03d1";
(*em)["Beta"] = "Β"; (*em)["Beta"] = "\u0392";
(*em)["Gamma"] = "Γ"; (*em)["laquo"] = "\u00ab";
(*em)["Delta"] = "Δ"; (*em)["rang"] = "\u232a";
(*em)["Epsilon"] = "Ε"; (*em)["tilde"] = "\u02dc";
(*em)["Zeta"] = "Ζ"; (*em)["Uuml"] = "\u00dc";
(*em)["Eta"] = "Η"; (*em)["zwj"] = "\u200d";
(*em)["Theta"] = "Θ"; (*em)["mu"] = "\u03bc";
(*em)["Iota"] = "Ι"; (*em)["Ccedil"] = "\u00c7";
(*em)["Kappa"] = "Κ"; (*em)["infin"] = "\u221e";
(*em)["Lambda"] = "Λ"; (*em)["ouml"] = "\u00f6";
(*em)["Mu"] = "Μ"; (*em)["rfloor"] = "\u230b";
(*em)["Nu"] = "Ν"; (*em)["pound"] = "\u00a3";
(*em)["Xi"] = "Ξ"; (*em)["szlig"] = "\u00df";
(*em)["Omicron"] = "Ο"; (*em)["thorn"] = "\u00fe";
(*em)["Pi"] = "Π"; (*em)["forall"] = "\u2200";
(*em)["Rho"] = "Ρ"; (*em)["piv"] = "\u03d6";
(*em)["Sigma"] = "Σ"; (*em)["rdquo"] = "\u201d";
(*em)["Tau"] = "Τ"; (*em)["frac12"] = "\u00bd";
(*em)["Upsilon"] = "Υ"; (*em)["frac14"] = "\u00bc";
(*em)["Phi"] = "Φ"; (*em)["Ocirc"] = "\u00d4";
(*em)["Chi"] = "Χ"; (*em)["Ecirc"] = "\u00ca";
(*em)["Psi"] = "Ψ"; (*em)["kappa"] = "\u03ba";
(*em)["Omega"] = "Ω"; (*em)["Euml"] = "\u00cb";
(*em)["alpha"] = "α"; (*em)["minus"] = "\u2212";
(*em)["beta"] = "β"; (*em)["cong"] = "\u2245";
(*em)["gamma"] = "γ"; (*em)["hellip"] = "\u2026";
(*em)["delta"] = "δ"; (*em)["equiv"] = "\u2261";
(*em)["epsilon"] = "ε"; (*em)["cent"] = "\u00a2";
(*em)["zeta"] = "ζ"; (*em)["Uacute"] = "\u00da";
(*em)["eta"] = "η"; (*em)["darr"] = "\u2193";
(*em)["theta"] = "θ"; (*em)["Eta"] = "\u0397";
(*em)["iota"] = "ι"; (*em)["sbquo"] = "\u201a";
(*em)["kappa"] = "κ"; (*em)["rArr"] = "\u21d2";
(*em)["lambda"] = "λ"; (*em)["igrave"] = "\u00ec";
(*em)["mu"] = "μ"; (*em)["uml"] = "\u00a8";
(*em)["nu"] = "ν"; (*em)["lambda"] = "\u03bb";
(*em)["xi"] = "ξ"; (*em)["oelig"] = "\u0153";
(*em)["omicron"] = "ο"; (*em)["harr"] = "\u2194";
(*em)["pi"] = "π"; (*em)["ang"] = "\u2220";
(*em)["rho"] = "ρ"; (*em)["clubs"] = "\u2663";
(*em)["sigmaf"] = "ς"; (*em)["and"] = "\u2227";
(*em)["sigma"] = "σ"; (*em)["permil"] = "\u2030";
(*em)["tau"] = "τ"; (*em)["larr"] = "\u2190";
(*em)["upsilon"] = "υ"; (*em)["Yuml"] = "\u0178";
(*em)["phi"] = "φ"; (*em)["cup"] = "\u222a";
(*em)["chi"] = "χ"; (*em)["Xi"] = "\u039e";
(*em)["psi"] = "ψ"; (*em)["Alpha"] = "\u0391";
(*em)["omega"] = "ω"; (*em)["phi"] = "\u03c6";
(*em)["thetasym"] = "ϑ"; (*em)["ucirc"] = "\u00fb";
(*em)["upsih"] = "ϒ"; (*em)["oslash"] = "\u00f8";
(*em)["piv"] = "ϖ"; (*em)["rsquo"] = "\u2019";
(*em)["bull"] = "•"; (*em)["AElig"] = "\u00c6";
(*em)["hellip"] = "…"; (*em)["mdash"] = "\u2014";
(*em)["prime"] = ""; (*em)["psi"] = "\u03c8";
(*em)["Prime"] = "″"; (*em)["eacute"] = "\u00e9";
(*em)["oline"] = "‾"; (*em)["otilde"] = "\u00f5";
(*em)["frasl"] = ""; (*em)["yen"] = "\u00a5";
(*em)["weierp"] = "℘"; (*em)["gt"] = "\u003e";
(*em)["image"] = ""; (*em)["Iuml"] = "\u00cf";
(*em)["real"] = ""; (*em)["Prime"] = "\u2033";
(*em)["trade"] = "™"; (*em)["Chi"] = "\u03a7";
(*em)["alefsym"] = "ℵ"; (*em)["ge"] = "\u2265";
(*em)["larr"] = "←"; (*em)["reg"] = "\u00ae";
(*em)["uarr"] = "↑"; (*em)["hearts"] = "\u2665";
(*em)["rarr"] = "→"; (*em)["auml"] = "\u00e4";
(*em)["darr"] = "↓"; (*em)["Agrave"] = "\u00c0";
(*em)["harr"] = "↔"; (*em)["sect"] = "\u00a7";
(*em)["crarr"] = "↵"; (*em)["sube"] = "\u2286";
(*em)["lArr"] = "⇐"; (*em)["sigmaf"] = "\u03c2";
(*em)["uArr"] = "⇑"; (*em)["Gamma"] = "\u0393";
(*em)["rArr"] = "⇒"; (*em)["amp"] = "\u0026";
(*em)["dArr"] = "⇓"; (*em)["ensp"] = "\u2002";
(*em)["hArr"] = "⇔"; (*em)["ETH"] = "\u00d0";
(*em)["forall"] = "∀"; (*em)["Igrave"] = "\u00cc";
(*em)["part"] = "∂"; (*em)["Omega"] = "\u03a9";
(*em)["exist"] = "∃"; (*em)["Lambda"] = "\u039b";
(*em)["empty"] = "∅"; (*em)["Omicron"] = "\u039f";
(*em)["nabla"] = "∇"; (*em)["there4"] = "\u2234";
(*em)["isin"] = "∈"; (*em)["ntilde"] = "\u00f1";
(*em)["notin"] = "∉"; (*em)["xi"] = "\u03be";
(*em)["ni"] = "∋"; (*em)["dagger"] = "\u2020";
(*em)["prod"] = "∏"; (*em)["egrave"] = "\u00e8";
(*em)["sum"] = "∑"; (*em)["Delta"] = "\u0394";
(*em)["minus"] = ""; (*em)["OElig"] = "\u0152";
(*em)["lowast"] = ""; (*em)["diams"] = "\u2666";
(*em)["radic"] = "√"; (*em)["ldquo"] = "\u201c";
(*em)["prop"] = "∝"; (*em)["radic"] = "\u221a";
(*em)["infin"] = "∞"; (*em)["Oslash"] = "\u00d8";
(*em)["ang"] = "∠"; (*em)["Ouml"] = "\u00d6";
(*em)["and"] = "∧"; (*em)["lceil"] = "\u2308";
(*em)["or"] = ""; (*em)["uarr"] = "\u2191";
(*em)["cap"] = "∩"; (*em)["atilde"] = "\u00e3";
(*em)["cup"] = ""; (*em)["iquest"] = "\u00bf";
(*em)["int"] = "∫"; (*em)["lsaquo"] = "\u2039";
(*em)["there4"] = "∴"; (*em)["Epsilon"] = "\u0395";
(*em)["sim"] = ""; (*em)["iacute"] = "\u00ed";
(*em)["cong"] = "≅"; (*em)["cap"] = "\u2229";
(*em)["asymp"] = "≈"; (*em)["deg"] = "\u00b0";
(*em)["ne"] = "≠"; (*em)["Otilde"] = "\u00d5";
(*em)["equiv"] = "≡"; (*em)["zeta"] = "\u03b6";
(*em)["le"] = "≤"; (*em)["ocirc"] = "\u00f4";
(*em)["ge"] = "≥"; (*em)["scaron"] = "\u0161";
(*em)["sub"] = "⊂"; (*em)["ecirc"] = "\u00ea";
(*em)["sup"] = "⊃"; (*em)["ordm"] = "\u00ba";
(*em)["nsub"] = "⊄"; (*em)["tau"] = "\u03c4";
(*em)["sube"] = "⊆"; (*em)["Auml"] = "\u00c4";
(*em)["supe"] = "⊇"; (*em)["dArr"] = "\u21d3";
(*em)["oplus"] = "⊕"; (*em)["ordf"] = "\u00aa";
(*em)["otimes"] = "⊗"; (*em)["alefsym"] = "\u2135";
(*em)["perp"] = "⊥"; (*em)["notin"] = "\u2209";
(*em)["sdot"] = "⋅"; (*em)["Pi"] = "\u03a0";
(*em)["lceil"] = "⌈"; (*em)["sdot"] = "\u22c5";
(*em)["rceil"] = "⌉"; (*em)["upsilon"] = "\u03c5";
(*em)["lfloor"] = "⌊"; (*em)["iota"] = "\u03b9";
(*em)["rfloor"] = "⌋"; (*em)["hArr"] = "\u21d4";
(*em)["lang"] = "〈"; (*em)["Sigma"] = "\u03a3";
(*em)["rang"] = "〉"; (*em)["lang"] = "\u2329";
(*em)["loz"] = "◊"; (*em)["curren"] = "\u00a4";
(*em)["spades"] = "♠"; (*em)["Theta"] = "\u0398";
(*em)["clubs"] = "♣"; (*em)["lArr"] = "\u21d0";
(*em)["hearts"] = "♥"; (*em)["Phi"] = "\u03a6";
(*em)["diams"] = "♦"; (*em)["Nu"] = "\u039d";
(*em)["quot"] = "\""; (*em)["rho"] = "\u03c1";
(*em)["amp"] = "&"; (*em)["alpha"] = "\u03b1";
(*em)["lt"] = "<"; (*em)["iexcl"] = "\u00a1";
(*em)["gt"] = ">"; (*em)["micro"] = "\u00b5";
(*em)["OElig"] = "Œ"; (*em)["cedil"] = "\u00b8";
(*em)["oelig"] = "œ"; (*em)["Ntilde"] = "\u00d1";
(*em)["Scaron"] = "Š"; (*em)["Psi"] = "\u03a8";
(*em)["scaron"] = "š"; (*em)["Dagger"] = "\u2021";
(*em)["Yuml"] = "Ÿ"; (*em)["Egrave"] = "\u00c8";
(*em)["circ"] = "ˆ"; (*em)["Icirc"] = "\u00ce";
(*em)["tilde"] = "˜"; (*em)["nsub"] = "\u2284";
(*em)["ensp"] = ""; (*em)["bdquo"] = "\u201e";
(*em)["emsp"] = ""; (*em)["empty"] = "\u2205";
(*em)["thinsp"] = ""; (*em)["aelig"] = "\u00e6";
(*em)["zwnj"] = ""; (*em)["ograve"] = "\u00f2";
(*em)["zwj"] = ""; (*em)["macr"] = "\u00af";
(*em)["lrm"] = ""; (*em)["Zeta"] = "\u0396";
(*em)["rlm"] = ""; (*em)["beta"] = "\u03b2";
(*em)["ndash"] = ""; (*em)["sim"] = "\u223c";
(*em)["mdash"] = "—"; (*em)["uuml"] = "\u00fc";
(*em)["lsquo"] = ""; (*em)["Aacute"] = "\u00c1";
(*em)["rsquo"] = ""; (*em)["Iacute"] = "\u00cd";
(*em)["sbquo"] = ""; (*em)["exist"] = "\u2203";
(*em)["ldquo"] = "“"; (*em)["prime"] = "\u2032";
(*em)["rdquo"] = "”"; (*em)["rceil"] = "\u2309";
(*em)["bdquo"] = "„"; (*em)["real"] = "\u211c";
(*em)["dagger"] = "†"; (*em)["zwnj"] = "\u200c";
(*em)["Dagger"] = "‡"; (*em)["bull"] = "\u2022";
(*em)["permil"] = "‰"; (*em)["quot"] = "\u0022";
(*em)["lsaquo"] = ""; (*em)["Scaron"] = "\u0160";
(*em)["rsaquo"] = ""; (*em)["ugrave"] = "\u00f9";
(*em)["euro"] = "€";
} }