package xmlx import "fmt" import "utf8" import "regexp" import "strconv" var reg_entity = regexp.MustCompile("^&#[0-9]+;$"); // Converts a single numerical html entity to a regular Go utf-token. // ex: "♣" -> "♣" func HtmlToUTF8(entity string) string { // Make sure we have a valid entity: { ok := reg_entity.MatchString(entity); if !ok { return "" } // Convert entity to number num, err := strconv.Atoi(entity[2:len(entity)-1]); if err != nil { return "" } var arr [3]byte; size := utf8.EncodeRune(num, &arr); if size == 0 { return "" } return string(&arr); } // Converts a single Go utf-token to it's an Html entity. // ex: "♣" -> "♣" func UTF8ToHtml(token string) string { rune, size := utf8.DecodeRuneInString(token); if size == 0 { return "" } return fmt.Sprintf("&#%d;", rune); } /* http://www.w3.org/TR/html4/sgml/entities.html Portions © International Organization for Standardization 1986 Permission to copy in any form is granted for use with conforming SGML systems and applications as defined in ISO 8879, provided this notice is included in all copies. Fills the supplied map with html entities mapped to their Go utf8 equivalents. This map can be assigned to xml.Parser.Entity It will be used to map non-standard xml entities to a proper value. If the parser encounters any unknown entities, it will throw a syntax error and abort the parsing. Hence the ability to supply this map. */ func loadNonStandardEntities(em *map[string]string) { (*em)["pi"] = "\u03c0"; (*em)["nabla"] = "\u2207"; (*em)["isin"] = "\u2208"; (*em)["loz"] = "\u25ca"; (*em)["prop"] = "\u221d"; (*em)["para"] = "\u00b6"; (*em)["Aring"] = "\u00c5"; (*em)["euro"] = "\u20ac"; (*em)["sup3"] = "\u00b3"; (*em)["sup2"] = "\u00b2"; (*em)["sup1"] = "\u00b9"; (*em)["prod"] = "\u220f"; (*em)["gamma"] = "\u03b3"; (*em)["perp"] = "\u22a5"; (*em)["lfloor"] = "\u230a"; (*em)["fnof"] = "\u0192"; (*em)["frasl"] = "\u2044"; (*em)["rlm"] = "\u200f"; (*em)["omega"] = "\u03c9"; (*em)["part"] = "\u2202"; (*em)["euml"] = "\u00eb"; (*em)["Kappa"] = "\u039a"; (*em)["nbsp"] = "\u00a0"; (*em)["Eacute"] = "\u00c9"; (*em)["brvbar"] = "\u00a6"; (*em)["otimes"] = "\u2297"; (*em)["ndash"] = "\u2013"; (*em)["thinsp"] = "\u2009"; (*em)["nu"] = "\u03bd"; (*em)["Upsilon"] = "\u03a5"; (*em)["upsih"] = "\u03d2"; (*em)["raquo"] = "\u00bb"; (*em)["yacute"] = "\u00fd"; (*em)["delta"] = "\u03b4"; (*em)["eth"] = "\u00f0"; (*em)["supe"] = "\u2287"; (*em)["ne"] = "\u2260"; (*em)["ni"] = "\u220b"; (*em)["eta"] = "\u03b7"; (*em)["uArr"] = "\u21d1"; (*em)["image"] = "\u2111"; (*em)["asymp"] = "\u2248"; (*em)["oacute"] = "\u00f3"; (*em)["rarr"] = "\u2192"; (*em)["emsp"] = "\u2003"; (*em)["acirc"] = "\u00e2"; (*em)["shy"] = "\u00ad"; (*em)["yuml"] = "\u00ff"; (*em)["acute"] = "\u00b4"; (*em)["int"] = "\u222b"; (*em)["ccedil"] = "\u00e7"; (*em)["Acirc"] = "\u00c2"; (*em)["Ograve"] = "\u00d2"; (*em)["times"] = "\u00d7"; (*em)["weierp"] = "\u2118"; (*em)["Tau"] = "\u03a4"; (*em)["omicron"] = "\u03bf"; (*em)["lt"] = "\u003c"; (*em)["Mu"] = "\u039c"; (*em)["Ucirc"] = "\u00db"; (*em)["sub"] = "\u2282"; (*em)["le"] = "\u2264"; (*em)["sum"] = "\u2211"; (*em)["sup"] = "\u2283"; (*em)["lrm"] = "\u200e"; (*em)["frac34"] = "\u00be"; (*em)["Iota"] = "\u0399"; (*em)["Ugrave"] = "\u00d9"; (*em)["THORN"] = "\u00de"; (*em)["rsaquo"] = "\u203a"; (*em)["not"] = "\u00ac"; (*em)["sigma"] = "\u03c3"; (*em)["iuml"] = "\u00ef"; (*em)["epsilon"] = "\u03b5"; (*em)["spades"] = "\u2660"; (*em)["theta"] = "\u03b8"; (*em)["divide"] = "\u00f7"; (*em)["Atilde"] = "\u00c3"; (*em)["uacute"] = "\u00fa"; (*em)["Rho"] = "\u03a1"; (*em)["trade"] = "\u2122"; (*em)["chi"] = "\u03c7"; (*em)["agrave"] = "\u00e0"; (*em)["or"] = "\u2228"; (*em)["circ"] = "\u02c6"; (*em)["middot"] = "\u00b7"; (*em)["plusmn"] = "\u00b1"; (*em)["aring"] = "\u00e5"; (*em)["lsquo"] = "\u2018"; (*em)["Yacute"] = "\u00dd"; (*em)["oline"] = "\u203e"; (*em)["copy"] = "\u00a9"; (*em)["icirc"] = "\u00ee"; (*em)["lowast"] = "\u2217"; (*em)["Oacute"] = "\u00d3"; (*em)["aacute"] = "\u00e1"; (*em)["oplus"] = "\u2295"; (*em)["crarr"] = "\u21b5"; (*em)["thetasym"] = "\u03d1"; (*em)["Beta"] = "\u0392"; (*em)["laquo"] = "\u00ab"; (*em)["rang"] = "\u232a"; (*em)["tilde"] = "\u02dc"; (*em)["Uuml"] = "\u00dc"; (*em)["zwj"] = "\u200d"; (*em)["mu"] = "\u03bc"; (*em)["Ccedil"] = "\u00c7"; (*em)["infin"] = "\u221e"; (*em)["ouml"] = "\u00f6"; (*em)["rfloor"] = "\u230b"; (*em)["pound"] = "\u00a3"; (*em)["szlig"] = "\u00df"; (*em)["thorn"] = "\u00fe"; (*em)["forall"] = "\u2200"; (*em)["piv"] = "\u03d6"; (*em)["rdquo"] = "\u201d"; (*em)["frac12"] = "\u00bd"; (*em)["frac14"] = "\u00bc"; (*em)["Ocirc"] = "\u00d4"; (*em)["Ecirc"] = "\u00ca"; (*em)["kappa"] = "\u03ba"; (*em)["Euml"] = "\u00cb"; (*em)["minus"] = "\u2212"; (*em)["cong"] = "\u2245"; (*em)["hellip"] = "\u2026"; (*em)["equiv"] = "\u2261"; (*em)["cent"] = "\u00a2"; (*em)["Uacute"] = "\u00da"; (*em)["darr"] = "\u2193"; (*em)["Eta"] = "\u0397"; (*em)["sbquo"] = "\u201a"; (*em)["rArr"] = "\u21d2"; (*em)["igrave"] = "\u00ec"; (*em)["uml"] = "\u00a8"; (*em)["lambda"] = "\u03bb"; (*em)["oelig"] = "\u0153"; (*em)["harr"] = "\u2194"; (*em)["ang"] = "\u2220"; (*em)["clubs"] = "\u2663"; (*em)["and"] = "\u2227"; (*em)["permil"] = "\u2030"; (*em)["larr"] = "\u2190"; (*em)["Yuml"] = "\u0178"; (*em)["cup"] = "\u222a"; (*em)["Xi"] = "\u039e"; (*em)["Alpha"] = "\u0391"; (*em)["phi"] = "\u03c6"; (*em)["ucirc"] = "\u00fb"; (*em)["oslash"] = "\u00f8"; (*em)["rsquo"] = "\u2019"; (*em)["AElig"] = "\u00c6"; (*em)["mdash"] = "\u2014"; (*em)["psi"] = "\u03c8"; (*em)["eacute"] = "\u00e9"; (*em)["otilde"] = "\u00f5"; (*em)["yen"] = "\u00a5"; (*em)["gt"] = "\u003e"; (*em)["Iuml"] = "\u00cf"; (*em)["Prime"] = "\u2033"; (*em)["Chi"] = "\u03a7"; (*em)["ge"] = "\u2265"; (*em)["reg"] = "\u00ae"; (*em)["hearts"] = "\u2665"; (*em)["auml"] = "\u00e4"; (*em)["Agrave"] = "\u00c0"; (*em)["sect"] = "\u00a7"; (*em)["sube"] = "\u2286"; (*em)["sigmaf"] = "\u03c2"; (*em)["Gamma"] = "\u0393"; (*em)["amp"] = "\u0026"; (*em)["ensp"] = "\u2002"; (*em)["ETH"] = "\u00d0"; (*em)["Igrave"] = "\u00cc"; (*em)["Omega"] = "\u03a9"; (*em)["Lambda"] = "\u039b"; (*em)["Omicron"] = "\u039f"; (*em)["there4"] = "\u2234"; (*em)["ntilde"] = "\u00f1"; (*em)["xi"] = "\u03be"; (*em)["dagger"] = "\u2020"; (*em)["egrave"] = "\u00e8"; (*em)["Delta"] = "\u0394"; (*em)["OElig"] = "\u0152"; (*em)["diams"] = "\u2666"; (*em)["ldquo"] = "\u201c"; (*em)["radic"] = "\u221a"; (*em)["Oslash"] = "\u00d8"; (*em)["Ouml"] = "\u00d6"; (*em)["lceil"] = "\u2308"; (*em)["uarr"] = "\u2191"; (*em)["atilde"] = "\u00e3"; (*em)["iquest"] = "\u00bf"; (*em)["lsaquo"] = "\u2039"; (*em)["Epsilon"] = "\u0395"; (*em)["iacute"] = "\u00ed"; (*em)["cap"] = "\u2229"; (*em)["deg"] = "\u00b0"; (*em)["Otilde"] = "\u00d5"; (*em)["zeta"] = "\u03b6"; (*em)["ocirc"] = "\u00f4"; (*em)["scaron"] = "\u0161"; (*em)["ecirc"] = "\u00ea"; (*em)["ordm"] = "\u00ba"; (*em)["tau"] = "\u03c4"; (*em)["Auml"] = "\u00c4"; (*em)["dArr"] = "\u21d3"; (*em)["ordf"] = "\u00aa"; (*em)["alefsym"] = "\u2135"; (*em)["notin"] = "\u2209"; (*em)["Pi"] = "\u03a0"; (*em)["sdot"] = "\u22c5"; (*em)["upsilon"] = "\u03c5"; (*em)["iota"] = "\u03b9"; (*em)["hArr"] = "\u21d4"; (*em)["Sigma"] = "\u03a3"; (*em)["lang"] = "\u2329"; (*em)["curren"] = "\u00a4"; (*em)["Theta"] = "\u0398"; (*em)["lArr"] = "\u21d0"; (*em)["Phi"] = "\u03a6"; (*em)["Nu"] = "\u039d"; (*em)["rho"] = "\u03c1"; (*em)["alpha"] = "\u03b1"; (*em)["iexcl"] = "\u00a1"; (*em)["micro"] = "\u00b5"; (*em)["cedil"] = "\u00b8"; (*em)["Ntilde"] = "\u00d1"; (*em)["Psi"] = "\u03a8"; (*em)["Dagger"] = "\u2021"; (*em)["Egrave"] = "\u00c8"; (*em)["Icirc"] = "\u00ce"; (*em)["nsub"] = "\u2284"; (*em)["bdquo"] = "\u201e"; (*em)["empty"] = "\u2205"; (*em)["aelig"] = "\u00e6"; (*em)["ograve"] = "\u00f2"; (*em)["macr"] = "\u00af"; (*em)["Zeta"] = "\u0396"; (*em)["beta"] = "\u03b2"; (*em)["sim"] = "\u223c"; (*em)["uuml"] = "\u00fc"; (*em)["Aacute"] = "\u00c1"; (*em)["Iacute"] = "\u00cd"; (*em)["exist"] = "\u2203"; (*em)["prime"] = "\u2032"; (*em)["rceil"] = "\u2309"; (*em)["real"] = "\u211c"; (*em)["zwnj"] = "\u200c"; (*em)["bull"] = "\u2022"; (*em)["quot"] = "\u0022"; (*em)["Scaron"] = "\u0160"; (*em)["ugrave"] = "\u00f9"; }