Rewrote loadNonStandardEntities() to represent each character with a more reliable token: the "\uxxxx" escape notation.

This commit is contained in:
jim teeuwen 2009-12-02 21:44:02 +01:00
parent 664e4ae0d2
commit 379dfddd5b
1 changed files with 252 additions and 254 deletions

View File

@ -33,7 +33,6 @@ func UTF8ToHtml(token string) string {
return fmt.Sprintf("&#%d;", rune);
}
/*
http://www.w3.org/TR/html4/sgml/entities.html
@ -49,259 +48,258 @@ func UTF8ToHtml(token string) string {
error and abort the parsing. Hence the ability to supply this map.
*/
// loadNonStandardEntities fills the map that em points to with named
// character entities (see http://www.w3.org/TR/html4/sgml/entities.html).
//
// Each key is the bare entity name, without the leading '&' or trailing ';'.
// Each value is the single character the entity stands for, written as a
// "\uXXXX" escape so the table does not depend on how this source file is
// encoded or displayed.
//
// Entries already present in the map under other names are left alone;
// entries whose name matches one listed here are overwritten. A nil em is
// ignored, and a nil map behind a non-nil em is allocated before filling.
func loadNonStandardEntities(em *map[string]string) {
	if em == nil {
		return
	}
	if *em == nil {
		// Writing to a nil map panics, so give the caller a usable one.
		*em = make(map[string]string, 252)
	}
	m := *em

	m["pi"] = "\u03c0"
	m["nabla"] = "\u2207"
	m["isin"] = "\u2208"
	m["loz"] = "\u25ca"
	m["prop"] = "\u221d"
	m["para"] = "\u00b6"
	m["Aring"] = "\u00c5"
	m["euro"] = "\u20ac"
	m["sup3"] = "\u00b3"
	m["sup2"] = "\u00b2"
	m["sup1"] = "\u00b9"
	m["prod"] = "\u220f"
	m["gamma"] = "\u03b3"
	m["perp"] = "\u22a5"
	m["lfloor"] = "\u230a"
	m["fnof"] = "\u0192"
	m["frasl"] = "\u2044"
	m["rlm"] = "\u200f"
	m["omega"] = "\u03c9"
	m["part"] = "\u2202"
	m["euml"] = "\u00eb"
	m["Kappa"] = "\u039a"
	m["nbsp"] = "\u00a0"
	m["Eacute"] = "\u00c9"
	m["brvbar"] = "\u00a6"
	m["otimes"] = "\u2297"
	m["ndash"] = "\u2013"
	m["thinsp"] = "\u2009"
	m["nu"] = "\u03bd"
	m["Upsilon"] = "\u03a5"
	m["upsih"] = "\u03d2"
	m["raquo"] = "\u00bb"
	m["yacute"] = "\u00fd"
	m["delta"] = "\u03b4"
	m["eth"] = "\u00f0"
	m["supe"] = "\u2287"
	m["ne"] = "\u2260"
	m["ni"] = "\u220b"
	m["eta"] = "\u03b7"
	m["uArr"] = "\u21d1"
	m["image"] = "\u2111"
	m["asymp"] = "\u2248"
	m["oacute"] = "\u00f3"
	m["rarr"] = "\u2192"
	m["emsp"] = "\u2003"
	m["acirc"] = "\u00e2"
	m["shy"] = "\u00ad"
	m["yuml"] = "\u00ff"
	m["acute"] = "\u00b4"
	m["int"] = "\u222b"
	m["ccedil"] = "\u00e7"
	m["Acirc"] = "\u00c2"
	m["Ograve"] = "\u00d2"
	m["times"] = "\u00d7"
	m["weierp"] = "\u2118"
	m["Tau"] = "\u03a4"
	m["omicron"] = "\u03bf"
	m["lt"] = "\u003c"
	m["Mu"] = "\u039c"
	m["Ucirc"] = "\u00db"
	m["sub"] = "\u2282"
	m["le"] = "\u2264"
	m["sum"] = "\u2211"
	m["sup"] = "\u2283"
	m["lrm"] = "\u200e"
	m["frac34"] = "\u00be"
	m["Iota"] = "\u0399"
	m["Ugrave"] = "\u00d9"
	m["THORN"] = "\u00de"
	m["rsaquo"] = "\u203a"
	m["not"] = "\u00ac"
	m["sigma"] = "\u03c3"
	m["iuml"] = "\u00ef"
	m["epsilon"] = "\u03b5"
	m["spades"] = "\u2660"
	m["theta"] = "\u03b8"
	m["divide"] = "\u00f7"
	m["Atilde"] = "\u00c3"
	m["uacute"] = "\u00fa"
	m["Rho"] = "\u03a1"
	m["trade"] = "\u2122"
	m["chi"] = "\u03c7"
	m["agrave"] = "\u00e0"
	m["or"] = "\u2228"
	m["circ"] = "\u02c6"
	m["middot"] = "\u00b7"
	m["plusmn"] = "\u00b1"
	m["aring"] = "\u00e5"
	m["lsquo"] = "\u2018"
	m["Yacute"] = "\u00dd"
	m["oline"] = "\u203e"
	m["copy"] = "\u00a9"
	m["icirc"] = "\u00ee"
	m["lowast"] = "\u2217"
	m["Oacute"] = "\u00d3"
	m["aacute"] = "\u00e1"
	m["oplus"] = "\u2295"
	m["crarr"] = "\u21b5"
	m["thetasym"] = "\u03d1"
	m["Beta"] = "\u0392"
	m["laquo"] = "\u00ab"
	m["rang"] = "\u232a"
	m["tilde"] = "\u02dc"
	m["Uuml"] = "\u00dc"
	m["zwj"] = "\u200d"
	m["mu"] = "\u03bc"
	m["Ccedil"] = "\u00c7"
	m["infin"] = "\u221e"
	m["ouml"] = "\u00f6"
	m["rfloor"] = "\u230b"
	m["pound"] = "\u00a3"
	m["szlig"] = "\u00df"
	m["thorn"] = "\u00fe"
	m["forall"] = "\u2200"
	m["piv"] = "\u03d6"
	m["rdquo"] = "\u201d"
	m["frac12"] = "\u00bd"
	m["frac14"] = "\u00bc"
	m["Ocirc"] = "\u00d4"
	m["Ecirc"] = "\u00ca"
	m["kappa"] = "\u03ba"
	m["Euml"] = "\u00cb"
	m["minus"] = "\u2212"
	m["cong"] = "\u2245"
	m["hellip"] = "\u2026"
	m["equiv"] = "\u2261"
	m["cent"] = "\u00a2"
	m["Uacute"] = "\u00da"
	m["darr"] = "\u2193"
	m["Eta"] = "\u0397"
	m["sbquo"] = "\u201a"
	m["rArr"] = "\u21d2"
	m["igrave"] = "\u00ec"
	m["uml"] = "\u00a8"
	m["lambda"] = "\u03bb"
	m["oelig"] = "\u0153"
	m["harr"] = "\u2194"
	m["ang"] = "\u2220"
	m["clubs"] = "\u2663"
	m["and"] = "\u2227"
	m["permil"] = "\u2030"
	m["larr"] = "\u2190"
	m["Yuml"] = "\u0178"
	m["cup"] = "\u222a"
	m["Xi"] = "\u039e"
	m["Alpha"] = "\u0391"
	m["phi"] = "\u03c6"
	m["ucirc"] = "\u00fb"
	m["oslash"] = "\u00f8"
	m["rsquo"] = "\u2019"
	m["AElig"] = "\u00c6"
	m["mdash"] = "\u2014"
	m["psi"] = "\u03c8"
	m["eacute"] = "\u00e9"
	m["otilde"] = "\u00f5"
	m["yen"] = "\u00a5"
	m["gt"] = "\u003e"
	m["Iuml"] = "\u00cf"
	m["Prime"] = "\u2033"
	m["Chi"] = "\u03a7"
	m["ge"] = "\u2265"
	m["reg"] = "\u00ae"
	m["hearts"] = "\u2665"
	m["auml"] = "\u00e4"
	m["Agrave"] = "\u00c0"
	m["sect"] = "\u00a7"
	m["sube"] = "\u2286"
	m["sigmaf"] = "\u03c2"
	m["Gamma"] = "\u0393"
	m["amp"] = "\u0026"
	m["ensp"] = "\u2002"
	m["ETH"] = "\u00d0"
	m["Igrave"] = "\u00cc"
	m["Omega"] = "\u03a9"
	m["Lambda"] = "\u039b"
	m["Omicron"] = "\u039f"
	m["there4"] = "\u2234"
	m["ntilde"] = "\u00f1"
	m["xi"] = "\u03be"
	m["dagger"] = "\u2020"
	m["egrave"] = "\u00e8"
	m["Delta"] = "\u0394"
	m["OElig"] = "\u0152"
	m["diams"] = "\u2666"
	m["ldquo"] = "\u201c"
	m["radic"] = "\u221a"
	m["Oslash"] = "\u00d8"
	m["Ouml"] = "\u00d6"
	m["lceil"] = "\u2308"
	m["uarr"] = "\u2191"
	m["atilde"] = "\u00e3"
	m["iquest"] = "\u00bf"
	m["lsaquo"] = "\u2039"
	m["Epsilon"] = "\u0395"
	m["iacute"] = "\u00ed"
	m["cap"] = "\u2229"
	m["deg"] = "\u00b0"
	m["Otilde"] = "\u00d5"
	m["zeta"] = "\u03b6"
	m["ocirc"] = "\u00f4"
	m["scaron"] = "\u0161"
	m["ecirc"] = "\u00ea"
	m["ordm"] = "\u00ba"
	m["tau"] = "\u03c4"
	m["Auml"] = "\u00c4"
	m["dArr"] = "\u21d3"
	m["ordf"] = "\u00aa"
	m["alefsym"] = "\u2135"
	m["notin"] = "\u2209"
	m["Pi"] = "\u03a0"
	m["sdot"] = "\u22c5"
	m["upsilon"] = "\u03c5"
	m["iota"] = "\u03b9"
	m["hArr"] = "\u21d4"
	m["Sigma"] = "\u03a3"
	m["lang"] = "\u2329"
	m["curren"] = "\u00a4"
	m["Theta"] = "\u0398"
	m["lArr"] = "\u21d0"
	m["Phi"] = "\u03a6"
	m["Nu"] = "\u039d"
	m["rho"] = "\u03c1"
	m["alpha"] = "\u03b1"
	m["iexcl"] = "\u00a1"
	m["micro"] = "\u00b5"
	m["cedil"] = "\u00b8"
	m["Ntilde"] = "\u00d1"
	m["Psi"] = "\u03a8"
	m["Dagger"] = "\u2021"
	m["Egrave"] = "\u00c8"
	m["Icirc"] = "\u00ce"
	m["nsub"] = "\u2284"
	m["bdquo"] = "\u201e"
	m["empty"] = "\u2205"
	m["aelig"] = "\u00e6"
	m["ograve"] = "\u00f2"
	m["macr"] = "\u00af"
	m["Zeta"] = "\u0396"
	m["beta"] = "\u03b2"
	m["sim"] = "\u223c"
	m["uuml"] = "\u00fc"
	m["Aacute"] = "\u00c1"
	m["Iacute"] = "\u00cd"
	m["exist"] = "\u2203"
	m["prime"] = "\u2032"
	m["rceil"] = "\u2309"
	m["real"] = "\u211c"
	m["zwnj"] = "\u200c"
	m["bull"] = "\u2022"
	m["quot"] = "\u0022"
	m["Scaron"] = "\u0160"
	m["ugrave"] = "\u00f9"
}