From 5bbe4dbe1099ff9e0c4adb36ebfd99e2b79ec72c Mon Sep 17 00:00:00 2001 From: jim teeuwen Date: Tue, 24 Nov 2009 17:49:27 +0100 Subject: [PATCH] added HtmlToUTF8() and UTF8ToHTML() functions for converting non-standard html entities. --- src/document.go | 2 +- src/entitymap.go | 52 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/src/document.go b/src/document.go index 1cb6842..9fd7665 100644 --- a/src/document.go +++ b/src/document.go @@ -60,7 +60,7 @@ func New() *Document { // if need be, this method can be called to fill the map with the entire set // defined on http://www.w3.org/TR/html4/sgml/entities.html func (this *Document) LoadExtendedEntityMap() { - entitymap_load(&this.Entity); + loadNonStandardEntities(&this.Entity); } func (this *Document) String() string { diff --git a/src/entitymap.go b/src/entitymap.go index d51f46d..b6a016c 100644 --- a/src/entitymap.go +++ b/src/entitymap.go @@ -1,12 +1,54 @@ package xmlx +import "fmt" +import "utf8" +import "regexp" +import "strconv" + +var reg_entity = regexp.MustCompile("^&#[0-9]+;$"); + +// HtmlToUTF8 converts a single numerical html entity of the form "&#NNN;" (decimal digits only) to its UTF-8 encoded Go string. It returns "" when the input does not match that form or the number cannot be parsed. +// ex: "&#9827;" -> "♣" +func HtmlToUTF8(entity string) string { + // Make sure we have a valid entity: &#123; + ok := reg_entity.MatchString(entity); + if !ok { return "" } + + // Convert entity to number (strip the leading "&#" and the trailing ";") + num, err := strconv.Atoi(entity[2:len(entity)-1]); + if err != nil { return "" } + // A UTF-8 sequence can be up to 4 bytes long, so a 3 byte buffer overflows for code points >= U+10000. + var arr [4]byte; + size := utf8.EncodeRune(num, &arr); + if size == 0 { return "" } + // Return only the bytes EncodeRune wrote; converting the whole buffer would append unused zero bytes. + return string(arr[0:size]); +} + +// Converts a single Go utf-token to its Html entity.
+// ex: "♣" -> "&#9827;". Only the first rune decoded from token is converted; "" is returned when the decoder reports a size of 0. +func UTF8ToHtml(token string) string { + rune, size := utf8.DecodeRuneInString(token); + if size == 0 { return "" } + return fmt.Sprintf("&#%d;", rune); +} + + /* - Portions © International Organization for Standardization 1986 - Permission to copy in any form is granted for use with - conforming SGML systems and applications as defined in - ISO 8879, provided this notice is included in all copies. + http://www.w3.org/TR/html4/sgml/entities.html + + Portions © International Organization for Standardization 1986 + Permission to copy in any form is granted for use with + conforming SGML systems and applications as defined in + ISO 8879, provided this notice is included in all copies. + + Fills the supplied map with html entities mapped to their Go utf8 + equivalents. This map can be assigned to xml.Parser.Entity. + It will be used to map non-standard xml entities to a proper value. + If the parser encounters any unknown entities, it will throw a syntax + error and abort the parsing. Hence the ability to supply this map. */ -func entitymap_load(em *map[string]string) { +func loadNonStandardEntities(em *map[string]string) { // Generic entities string([]uint8{160}); (*em)["nbsp"] = " "; (*em)["iexcl"] = "¡";