Got rid of deprecated Node API functions. Removed dependency on go-iconv in favor of go-charset (see README). go-charset is a native Go package and thus requires no CGO functionality, meaning go-pkg-xmlx and go-pkg-rss can now be used in Google AppEngine. Some speed and memory efficiency improvements added. Should now do far fewer []byte -> string conversions.

This commit is contained in:
jim teeuwen 2011-05-11 17:44:09 +02:00
parent 882ba8d0bb
commit ea7fc45209
5 changed files with 128 additions and 244 deletions

5
README
View File

@ -12,13 +12,14 @@
DEPENDENCIES
================================================================================
goinstall github.com/sloonz/go-iconv/src
goinstall go-charset.googlecode.com/hg/charset
================================================================================
USAGE
================================================================================
Getting the package up and running is simple enough:
Getting the package up and running is simple enough. This should also
automatically take care of any dependencies for you:
$ goinstall github.com/jteeuwen/go-pkg-xmlx

View File

@ -28,24 +28,26 @@ import (
"os"
"io"
"io/ioutil"
"path"
"strings"
"bytes"
"xml"
"fmt"
"http"
iconv "github.com/sloonz/go-iconv/src"
"go-charset.googlecode.com/hg/charset"
)
// represents a single XML document.
type Document struct {
Version string
Encoding string
StandAlone string
SaveDocType bool
Root *Node
Entity map[string]string
Verbose bool
Version string // XML version
Encoding string // Encoding found in document. If absent, assumes UTF-8.
StandAlone string // Value of XML doctype's 'standalone' attribute.
SaveDocType bool // Whether or not to include the XML doctype in saves.
Root *Node // The document's root node.
Entity map[string]string // Mapping of custom entity conversions.
Verbose bool // [deprecated] Not actually used anymore.
}
// Create a new, empty XML document instance.
func New() *Document {
return &Document{
Version: "1.0",
@ -53,7 +55,6 @@ func New() *Document {
StandAlone: "yes",
SaveDocType: true,
Entity: make(map[string]string),
Verbose: false,
}
}
@ -64,50 +65,38 @@ func New() *Document {
// defined on http://www.w3.org/TR/html4/sgml/entities.html
func (this *Document) LoadExtendedEntityMap() { loadNonStandardEntities(this.Entity) }
func (this *Document) String() string {
s, _ := this.SaveString()
return s
}
// Select a single node with the given namespace and name by delegating to the
// document's root node. Returns nil if no matching node was found, or if the
// document has no root node (for example because nothing has been loaded yet).
func (this *Document) SelectNode(namespace, name string) *Node {
	// A document that has not been loaded may have a nil Root; report
	// "not found" instead of invoking the selector on a nil node.
	if this.Root == nil {
		return nil
	}
	return this.Root.SelectNode(namespace, name)
}
// Select all nodes with the given namespace and name by delegating to the
// document's root node. Returns an empty slice if no matches were found, or
// if the document has no root node (for example because nothing has been
// loaded yet).
func (this *Document) SelectNodes(namespace, name string) []*Node {
	// A document that has not been loaded may have a nil Root; honour the
	// "empty slice" contract instead of invoking the selector on a nil node.
	if this.Root == nil {
		return []*Node{}
	}
	return this.Root.SelectNodes(namespace, name)
}
// *****************************************************************************
// *** Satisfy ILoader interface
// *****************************************************************************
func (this *Document) LoadString(s string) (err os.Error) {
// Ensure we are passing UTF-8 encoding content to the XML tokenizer.
if s, err = this.correctEncoding(s); err != nil {
return
}
// tokenize data
xp := xml.NewParser(strings.NewReader(s))
// Load the contents of this document from the supplied reader.
func (this *Document) LoadStream(r io.Reader) (err os.Error) {
xp := xml.NewParser(r)
xp.Entity = this.Entity
xp.CharsetReader = func(enc string, input io.Reader) (io.Reader, os.Error) {
return charset.NewReader(enc, input)
}
this.Root = NewNode(NT_ROOT)
ct := this.Root
var tok xml.Token
var t *Node
var i int
var doctype string
var v xml.Attr
for {
if tok, err = xp.Token(); err != nil {
if err == os.EOF {
return nil
}
if this.Verbose {
fmt.Fprintf(os.Stderr, "Xml Error: %s\n", err)
}
return err
}
@ -128,7 +117,7 @@ func (this *Document) LoadString(s string) (err os.Error) {
t = NewNode(NT_ELEMENT)
t.Name = tt.Name
t.Attributes = make([]*Attr, len(tt.Attr))
for i, v = range tt.Attr {
for i, v := range tt.Attr {
t.Attributes[i] = new(Attr)
t.Attributes[i].Name = v.Name
t.Attributes[i].Value = v.Value
@ -138,7 +127,7 @@ func (this *Document) LoadString(s string) (err os.Error) {
case xml.ProcInst:
if tt.Target == "xml" { // xml doctype
doctype = strings.TrimSpace(string(tt.Inst))
if i = strings.Index(doctype, `standalone="`); i > -1 {
if i := strings.Index(doctype, `standalone="`); i > -1 {
this.StandAlone = doctype[i+len(`standalone="`) : len(doctype)]
i = strings.Index(this.StandAlone, `"`)
this.StandAlone = this.StandAlone[0:i]
@ -159,16 +148,28 @@ func (this *Document) LoadString(s string) (err os.Error) {
return
}
func (this *Document) LoadFile(filename string) (err os.Error) {
var data []byte
// LoadBytes fills this document from the XML held in d. The slice is wrapped
// in a buffer and handed to LoadStream, whose error is returned unchanged.
func (this *Document) LoadBytes(d []byte) (err os.Error) {
	src := bytes.NewBuffer(d)
	err = this.LoadStream(src)
	return
}
if data, err = ioutil.ReadFile(path.Clean(filename)); err != nil {
// LoadString fills this document from the XML text in s. The string is
// exposed as a reader and handed to LoadStream, whose error is returned
// unchanged.
func (this *Document) LoadString(s string) (err os.Error) {
	src := strings.NewReader(s)
	err = this.LoadStream(src)
	return
}
// Load the contents of this document from the supplied file.
func (this *Document) LoadFile(filename string) (err os.Error) {
var fd *os.File
if fd, err = os.Open(filename); err != nil {
return
}
return this.LoadString(string(data))
defer fd.Close()
return this.LoadStream(fd)
}
// Load the contents of this document from the supplied uri.
func (this *Document) LoadUri(uri string) (err os.Error) {
var r *http.Response
if r, _, err = http.Get(uri); err != nil {
@ -176,105 +177,36 @@ func (this *Document) LoadUri(uri string) (err os.Error) {
}
defer r.Body.Close()
var b []byte
if b, err = ioutil.ReadAll(r.Body); err != nil {
return
return this.LoadStream(r.Body)
}
return this.LoadString(string(b))
// SaveFile serializes this document with SaveBytes and writes the result to
// the file named by path using mode 0600. The write error, if any, is
// returned.
func (this *Document) SaveFile(path string) os.Error {
	data := this.SaveBytes()
	return ioutil.WriteFile(path, data, 0600)
}
func (this *Document) LoadStream(r io.Reader) (err os.Error) {
var b []byte
if b, err = ioutil.ReadAll(r); err != nil {
return
}
return this.LoadString(string(b))
}
// Save the contents of this document as a byte slice.
func (this *Document) SaveBytes() []byte {
var b bytes.Buffer
// *****************************************************************************
// *** Satisfy ISaver interface
// *****************************************************************************
func (this *Document) SaveFile(path string) (err os.Error) {
var data string
if data, err = this.SaveString(); err != nil {
return
}
return ioutil.WriteFile(path, []byte(data), 0600)
}
func (this *Document) SaveString() (s string, err os.Error) {
if this.SaveDocType {
s = fmt.Sprintf(`<?xml version="%s" encoding="%s" standalone="%s"?>`,
this.Version, this.Encoding, this.StandAlone)
b.WriteString(fmt.Sprintf(`<?xml version="%s" encoding="%s" standalone="%s"?>`,
this.Version, this.Encoding, this.StandAlone))
}
s += this.Root.String()
return
b.Write(this.Root.Bytes())
return b.Bytes()
}
// SaveString returns the serialized form of this document, as produced by
// SaveBytes, converted to a string.
func (this *Document) SaveString() string {
	raw := this.SaveBytes()
	return string(raw)
}
// String yields the same text as Document.SaveString(). It exists so that
// anything looking for the standard String() method picks it up
// (e.g. fmt.Printf("%s\n", mydoc)).
func (this *Document) String() string {
	raw := this.SaveBytes()
	return string(raw)
}
// Save the contents of this document to the supplied writer.
func (this *Document) SaveStream(w io.Writer) (err os.Error) {
var s string
if s, err = this.SaveString(); err != nil {
_, err = w.Write(this.SaveBytes())
return
}
_, err = w.Write([]byte(s))
return
}
// correctEncoding uses libiconv to ensure we hand UTF-8 encoded data to the
// Go XML tokenizer, which (per the original author's note) does not cope with
// anything else. It tokenizes data looking for the <?xml ...?> declaration,
// reads the encoding entry from it (falling back to "utf-8" when none is
// found) and converts the data only when that encoding is not already UTF-8.
// Tokenizer errors other than end of input, and errors from opening the
// converter, are returned to the caller.
func (this *Document) correctEncoding(data string) (ret string, err os.Error) {
var cd *iconv.Iconv
var tok xml.Token
// Default used when the document declares no encoding.
enc := "utf-8"
xp := xml.NewParser(strings.NewReader(data))
xp.Entity = this.Entity
loop:
for {
if tok, err = xp.Token(); err != nil {
if err == os.EOF {
// End of input reached without finding an encoding: keep the default.
break loop
}
return "", err
}
switch tt := tok.(type) {
case xml.ProcInst:
if tt.Target == "xml" { // xml doctype
var pair []string
var entry string
// Split the declaration body into space-separated key=value entries.
list := strings.Split(string(tt.Inst), " ", -1)
for _, entry = range list {
if pair = strings.Split(entry, "=", -1); len(pair) < 2 {
continue
}
switch pair[0] {
case "encoding":
// Drop the first and last character of the value (the surrounding quotes).
enc = pair[1][1 : len(pair[1])-1]
break loop
}
}
}
}
}
if strings.ToLower(enc) == "utf-8" {
// Data already in utf-8 format. Nothing to do here.
return data, nil
}
// NOTE(review): iconv.Open appears to take (target, source) encodings — confirm against go-iconv.
if cd, err = iconv.Open("utf-8", enc); err != nil {
return
}
defer cd.Close()
return cd.Conv(data)
}

178
node.go
View File

@ -6,7 +6,6 @@ package xmlx
import (
"os"
"strings"
"xml"
"bytes"
"fmt"
@ -22,18 +21,18 @@ const (
)
type Attr struct {
Name xml.Name
Value string
Name xml.Name // Attribute namespace and name.
Value string // Attribute value.
}
type Node struct {
Type byte
Name xml.Name
Children []*Node
Attributes []*Attr
Parent *Node
Value string
Target string // procinst field
Type byte // Node type.
Name xml.Name // Node namespace and name.
Children []*Node // Child nodes.
Attributes []*Attr // Node attributes.
Parent *Node // Parent node.
Value string // Node value.
Target string // procinst field.
}
func NewNode(tid byte) *Node {
@ -47,7 +46,7 @@ func NewNode(tid byte) *Node {
// This wraps the standard xml.Unmarshal function and supplies this particular
// node as the content to be unmarshalled.
func (this *Node) Unmarshal(obj interface{}) os.Error {
return xml.Unmarshal(strings.NewReader(this.String()), obj)
return xml.Unmarshal(bytes.NewBuffer(this.Bytes()), obj)
}
// Get node value as string
@ -58,10 +57,6 @@ func (this *Node) S(namespace, name string) string {
return ""
}
// Deprecated - use Node.S()
func (this *Node) GetValue(namespace, name string) string { return this.S(namespace, name) }
// Get node value as int
func (this *Node) I(namespace, name string) int {
if node := rec_SelectNode(this, namespace, name); node != nil && node.Value != "" {
@ -71,10 +66,6 @@ func (this *Node) I(namespace, name string) int {
return 0
}
// Deprecated - use Node.I()
func (this *Node) GetValuei(namespace, name string) int { return this.I(namespace, name) }
// Get node value as int64
func (this *Node) I64(namespace, name string) int64 {
if node := rec_SelectNode(this, namespace, name); node != nil && node.Value != "" {
@ -84,10 +75,6 @@ func (this *Node) I64(namespace, name string) int64 {
return 0
}
// Deprecated - use Node.I64()
func (this *Node) GetValuei64(namespace, name string) int64 { return this.I64(namespace, name) }
// Get node value as uint
func (this *Node) U(namespace, name string) uint {
if node := rec_SelectNode(this, namespace, name); node != nil && node.Value != "" {
@ -97,10 +84,6 @@ func (this *Node) U(namespace, name string) uint {
return 0
}
// Deprecated - use Node.U()
func (this *Node) GetValueui(namespace, name string) uint { return this.U(namespace, name) }
// Get node value as uint64
func (this *Node) U64(namespace, name string) uint64 {
if node := rec_SelectNode(this, namespace, name); node != nil && node.Value != "" {
@ -110,9 +93,6 @@ func (this *Node) U64(namespace, name string) uint64 {
return 0
}
// Deprecated - use Node.U64()
func (this *Node) GetValueui64(namespace, name string) uint64 { return this.U64(namespace, name) }
// Get node value as float32
func (this *Node) F32(namespace, name string) float32 {
if node := rec_SelectNode(this, namespace, name); node != nil && node.Value != "" {
@ -122,10 +102,6 @@ func (this *Node) F32(namespace, name string) float32 {
return 0
}
// Deprecated - use Node.F32()
func (this *Node) GetValuef32(namespace, name string) float32 { return this.F32(namespace, name) }
// Get node value as float64
func (this *Node) F64(namespace, name string) float64 {
if node := rec_SelectNode(this, namespace, name); node != nil && node.Value != "" {
@ -135,10 +111,6 @@ func (this *Node) F64(namespace, name string) float64 {
return 0
}
// Deprecated - use Node.F64()
func (this *Node) GetValuef64(namespace, name string) float64 { return this.F64(namespace, name) }
// Get node value as bool
func (this *Node) B(namespace, name string) bool {
if node := rec_SelectNode(this, namespace, name); node != nil && node.Value != "" {
@ -158,10 +130,6 @@ func (this *Node) As(namespace, name string) string {
return ""
}
// Deprecated - use Node.As()
func (this *Node) GetAttr(namespace, name string) string { return this.As(namespace, name) }
// Get attribute value as int
func (this *Node) Ai(namespace, name string) int {
if s := this.As(namespace, name); s != "" {
@ -171,10 +139,6 @@ func (this *Node) Ai(namespace, name string) int {
return 0
}
// Deprecated - use Node.Ai()
func (this *Node) GetAttri(namespace, name string) int { return this.Ai(namespace, name) }
// Get attribute value as uint
func (this *Node) Au(namespace, name string) uint {
if s := this.As(namespace, name); s != "" {
@ -184,10 +148,6 @@ func (this *Node) Au(namespace, name string) uint {
return 0
}
// Deprecated - use Node.Au()
func (this *Node) GetAttrui(namespace, name string) uint { return this.Au(namespace, name) }
// Get attribute value as uint64
func (this *Node) Au64(namespace, name string) uint64 {
if s := this.As(namespace, name); s != "" {
@ -197,10 +157,6 @@ func (this *Node) Au64(namespace, name string) uint64 {
return 0
}
// Deprecated - use Node.Au64()
func (this *Node) GetAttrui64(namespace, name string) uint64 { return this.Au64(namespace, name) }
// Get attribute value as int64
func (this *Node) Ai64(namespace, name string) int64 {
if s := this.As(namespace, name); s != "" {
@ -210,9 +166,6 @@ func (this *Node) Ai64(namespace, name string) int64 {
return 0
}
// Deprecated - use Node.Ai64()
func (this *Node) GetAttri64(namespace, name string) int64 { return this.Ai64(namespace, name) }
// Get attribute value as float32
func (this *Node) Af32(namespace, name string) float32 {
if s := this.As(namespace, name); s != "" {
@ -222,9 +175,6 @@ func (this *Node) Af32(namespace, name string) float32 {
return 0
}
// Deprecated - use Node.Af32()
func (this *Node) GetAttrf32(namespace, name string) float32 { return this.Af32(namespace, name) }
// Get attribute value as float64
func (this *Node) Af64(namespace, name string) float64 {
if s := this.As(namespace, name); s != "" {
@ -234,10 +184,6 @@ func (this *Node) Af64(namespace, name string) float64 {
return 0
}
// Deprecated - use Node.Af64()
func (this *Node) GetAttrf64(namespace, name string) float64 { return this.Af64(namespace, name) }
// Get attribute value as bool
func (this *Node) Ab(namespace, name string) bool {
if s := this.As(namespace, name); s != "" {
@ -298,94 +244,100 @@ func rec_SelectNodes(cn *Node, namespace, name string, list *[]*Node) {
}
}
// Bytes renders this node as a []byte, choosing the printer that matches its
// Type. NT_ROOT is the special-case empty node used as the root of a
// Document; it has no representation of its own and is rendered through
// printRoot, which emits its child nodes. A node whose Type matches none of
// the known kinds yields a nil slice.
func (this *Node) Bytes() (b []byte) {
	switch this.Type {
	case NT_PROCINST:
		return this.printProcInst()
	case NT_COMMENT:
		return this.printComment()
	case NT_DIRECTIVE:
		return this.printDirective()
	case NT_ELEMENT:
		return this.printElement()
	case NT_ROOT:
		return this.printRoot()
	}
	return nil
}
// Convert node to appropriate string representation based on its @Type.
// Note that NT_ROOT is a special-case empty node used as the root for a
// Document. This one has no representation by itself. It merely forwards the
// String() call to its child nodes.
func (this *Node) String() (s string) {
switch this.Type {
case NT_PROCINST:
s = this.printProcInst()
case NT_COMMENT:
s = this.printComment()
case NT_DIRECTIVE:
s = this.printDirective()
case NT_ELEMENT:
s = this.printElement()
case NT_ROOT:
s = this.printRoot()
}
return
return string(this.Bytes())
}
func (this *Node) printRoot() (s string) {
var data []byte
buf := bytes.NewBuffer(data)
func (this *Node) printRoot() []byte {
var b bytes.Buffer
for _, v := range this.Children {
buf.WriteString(v.String())
b.WriteString(v.String())
}
return buf.String()
return b.Bytes()
}
func (this *Node) printProcInst() string {
return "<?" + this.Target + " " + this.Value + "?>"
func (this *Node) printProcInst() []byte {
return []byte("<?" + this.Target + " " + this.Value + "?>")
}
func (this *Node) printComment() string {
return "<!-- " + this.Value + " -->"
func (this *Node) printComment() []byte {
return []byte("<!-- " + this.Value + " -->")
}
func (this *Node) printDirective() string {
return "<!" + this.Value + "!>"
func (this *Node) printDirective() []byte {
return []byte("<!" + this.Value + "!>")
}
func (this *Node) printElement() string {
var data []byte
buf := bytes.NewBuffer(data)
func (this *Node) printElement() []byte {
var b bytes.Buffer
if len(this.Name.Space) > 0 {
buf.WriteRune('<')
buf.WriteString(this.Name.Space)
buf.WriteRune(':')
buf.WriteString(this.Name.Local)
b.WriteRune('<')
b.WriteString(this.Name.Space)
b.WriteRune(':')
b.WriteString(this.Name.Local)
} else {
buf.WriteRune('<')
buf.WriteString(this.Name.Local)
b.WriteRune('<')
b.WriteString(this.Name.Local)
}
for _, v := range this.Attributes {
if len(v.Name.Space) > 0 {
buf.WriteString(fmt.Sprintf(` %s:%s="%s"`, v.Name.Space, v.Name.Local, v.Value))
b.WriteString(fmt.Sprintf(` %s:%s="%s"`, v.Name.Space, v.Name.Local, v.Value))
} else {
buf.WriteString(fmt.Sprintf(` %s="%s"`, v.Name.Local, v.Value))
b.WriteString(fmt.Sprintf(` %s="%s"`, v.Name.Local, v.Value))
}
}
if len(this.Children) == 0 && len(this.Value) == 0 {
buf.WriteString(" />")
return buf.String()
b.WriteString(" />")
return b.Bytes()
}
buf.WriteRune('>')
b.WriteRune('>')
for _, v := range this.Children {
buf.WriteString(v.String())
b.WriteString(v.String())
}
buf.WriteString(this.Value)
b.WriteString(this.Value)
if len(this.Name.Space) > 0 {
buf.WriteString("</")
buf.WriteString(this.Name.Space)
buf.WriteRune(':')
buf.WriteString(this.Name.Local)
buf.WriteRune('>')
b.WriteString("</")
b.WriteString(this.Name.Space)
b.WriteRune(':')
b.WriteString(this.Name.Local)
b.WriteRune('>')
} else {
buf.WriteString("</")
buf.WriteString(this.Name.Local)
buf.WriteRune('>')
b.WriteString("</")
b.WriteString(this.Name.Local)
b.WriteRune('>')
}
return buf.String()
return b.Bytes()
}
// Add a child node

View File

@ -1,4 +1,3 @@
<!DOCTYPE xml>
<?xml version="1.0" encoding="utf-8"?>
<rss version="0.91">
<channel>

View File

@ -1 +1 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?><!DOCTYPE xml!><rss version="0.91"><channel><title>WriteTheWeb</title><link>http://writetheweb.com</link><description>News for web users that write back</description><language>en-us</language><copyright>Copyright 2000, WriteTheWeb team.</copyright><managingEditor>editor@writetheweb.com</managingEditor><webMaster>webmaster@writetheweb.com</webMaster><image><title>WriteTheWeb</title><url>http://writetheweb.com/images/mynetscape88.gif</url><link>http://writetheweb.com</link><width>88</width><height>31</height><description>News for web users that write back</description></image><item><title>Giving the world a pluggable Gnutella</title><link>http://writetheweb.com/read.php?item=24</link><description>WorldOS is a framework on which to build programs that work like Freenet or Gnutella -allowing distributed applications using peer-to-peer routing.</description></item><item><title>Syndication discussions hot up</title><link>http://writetheweb.com/read.php?item=23</link><description>After a period of dormancy, the Syndication mailing list has become active again, with contributions from leaders in traditional media and Web syndication.</description></item><item><title>Personal web server integrates file sharing and messaging</title><link>http://writetheweb.com/read.php?item=22</link><description>The Magi Project is an innovative project to create a combined personal web server and messaging system that enables the sharing and synchronization of information across desktop, laptop and palmtop devices.</description></item><item><title>Syndication and Metadata</title><link>http://writetheweb.com/read.php?item=21</link><description>RSS is probably the best known metadata format around. RDF is probably one of the least understood. 
In this essay, published on my O'Reilly Network weblog, I argue that the next generation of RSS should be based on RDF.</description></item><item><title>UK bloggers get organised</title><link>http://writetheweb.com/read.php?item=20</link><description>Looks like the weblogs scene is gathering pace beyond the shores of the US. There's now a UK-specific page on weblogs.com, and a mailing list at egroups.</description></item><item><title>Yournamehere.com more important than anything</title><link>http://writetheweb.com/read.php?item=19</link><description>Whatever you're publishing on the web, your site name is the most valuable asset you have, according to Carl Steadman.</description></item></channel></rss>
<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="0.91"><channel><title>WriteTheWeb</title><link>http://writetheweb.com</link><description>News for web users that write back</description><language>en-us</language><copyright>Copyright 2000, WriteTheWeb team.</copyright><managingEditor>editor@writetheweb.com</managingEditor><webMaster>webmaster@writetheweb.com</webMaster><image><title>WriteTheWeb</title><url>http://writetheweb.com/images/mynetscape88.gif</url><link>http://writetheweb.com</link><width>88</width><height>31</height><description>News for web users that write back</description></image><item><title>Giving the world a pluggable Gnutella</title><link>http://writetheweb.com/read.php?item=24</link><description>WorldOS is a framework on which to build programs that work like Freenet or Gnutella -allowing distributed applications using peer-to-peer routing.</description></item><item><title>Syndication discussions hot up</title><link>http://writetheweb.com/read.php?item=23</link><description>After a period of dormancy, the Syndication mailing list has become active again, with contributions from leaders in traditional media and Web syndication.</description></item><item><title>Personal web server integrates file sharing and messaging</title><link>http://writetheweb.com/read.php?item=22</link><description>The Magi Project is an innovative project to create a combined personal web server and messaging system that enables the sharing and synchronization of information across desktop, laptop and palmtop devices.</description></item><item><title>Syndication and Metadata</title><link>http://writetheweb.com/read.php?item=21</link><description>RSS is probably the best known metadata format around. RDF is probably one of the least understood. 
In this essay, published on my O'Reilly Network weblog, I argue that the next generation of RSS should be based on RDF.</description></item><item><title>UK bloggers get organised</title><link>http://writetheweb.com/read.php?item=20</link><description>Looks like the weblogs scene is gathering pace beyond the shores of the US. There's now a UK-specific page on weblogs.com, and a mailing list at egroups.</description></item><item><title>Yournamehere.com more important than anything</title><link>http://writetheweb.com/read.php?item=19</link><description>Whatever you're publishing on the web, your site name is the most valuable asset you have, according to Carl Steadman.</description></item></channel></rss>