diff --git a/LICENSE b/LICENSE index 6574112..3dab946 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,5 @@ -Copyright (c) 2010, Jim Teeuwen. +Copyright (c) 2010-2011, Jim Teeuwen. All rights reserved. Redistribution and use in source and binary forms, with or without modification, diff --git a/README b/README index a197526..443cd0f 100644 --- a/README +++ b/README @@ -8,6 +8,12 @@ Nodes now simply become collections and don't require you to read them in the order in which the xml.Parser finds them. +================================================================================ + DEPENDENCIES +================================================================================ + + go-iconv: https://github.com/sloonz/go-iconv + ================================================================================ USAGE ================================================================================ diff --git a/xmlx/Makefile b/xmlx/Makefile index 2aa9b1c..6a9992f 100644 --- a/xmlx/Makefile +++ b/xmlx/Makefile @@ -1,6 +1,6 @@ include $(GOROOT)/src/Make.inc TARG = xmlx -GOFILES = document.go node.go io.go entitymap.go\ +GOFILES = document.go node.go entitymap.go include $(GOROOT)/src/Make.pkg diff --git a/xmlx/document.go b/xmlx/document.go index 1233e83..1e1959f 100644 --- a/xmlx/document.go +++ b/xmlx/document.go @@ -1,41 +1,40 @@ +// Copyright (c) 2010, Jim Teeuwen. All rights reserved. +// This code is subject to a 1-clause BSD license. +// The contents of which can be found in the LICENSE file. + /* -Copyright (c) 2010, Jim Teeuwen. -All rights reserved. - -This code is subject to a 1-clause BSD license. -The contents of which can be found in the LICENSE file. - - This package wraps the standard XML library and uses it to build a node tree of any document you load. This allows you to look up nodes forwards and backwards, - as well as perform search queries (no xpath support yet). + as well as perform simple search queries. Nodes now simply become collections and don't require you to read them in the order in which the xml.Parser finds them. - The Document currently implements 2 simple search functions which allow you to + The Document currently implements 2 search functions which allow you to look for specific nodes. - Document.SelectNode(namespace, name string) *Node; - Document.SelectNodes(namespace, name string) []*Node; + *xmlx.Document.SelectNode(namespace, name string) *Node; + *xmlx.Document.SelectNodes(namespace, name string) []*Node; SelectNode() returns the first, single node it finds matching the given name and namespace. SelectNodes() returns a slice containing all the matching nodes. Note that these search functions can be invoked on individual nodes as well. This allows you to search only a subset of the entire document. - */ package xmlx -import "os" -import "io" -import "io/ioutil" -import "path" -import "strings" -import "xml" -import "fmt" -import "http" +import ( + "os" + "io" + "io/ioutil" + "path" + "strings" + "xml" + "fmt" + "http" + "iconv" +) type Document struct { Version string @@ -82,6 +81,12 @@ func (this *Document) SelectNodes(namespace, name string) []*Node { // *** Satisfy ILoader interface // ***************************************************************************** func (this *Document) LoadString(s string) (err os.Error) { + // Ensure we are passing UTF-8 encoding content to the XML tokenizer. + if s, err = this.correctEncoding(s); err != nil { + return + } + + // tokenize data xp := xml.NewParser(strings.NewReader(s)) xp.Entity = this.Entity @@ -218,3 +223,48 @@ func (this *Document) SaveStream(w io.Writer) (err os.Error) { _, err = w.Write([]byte(s)) return } + +// Use libiconv to ensure we get UTF-8 encoded data. The Go Xml tokenizer will +// throw a tantrum if we give it anything else. +func (this *Document) correctEncoding(data string) (ret string, err os.Error) { + var cd *iconv.Iconv + var tok xml.Token + + enc := "utf-8" + xp := xml.NewParser(strings.NewReader(data)) + xp.Entity = this.Entity + +loop: + for { + if tok, err = xp.Token(); err != nil { + if err == os.EOF { + break loop + } + return "", err + } + + switch tt := tok.(type) { + case xml.ProcInst: + if tt.Target == "xml" { // xml doctype + enc = strings.ToLower(string(tt.Inst)) + if i := strings.Index(enc, `encoding="`); i > -1 { + enc = enc[i+len(`encoding="`):] + i = strings.Index(enc, `"`) + enc = enc[:i] + break loop + } + } + } + } + + if enc == "utf-8" { + return data, nil + } + + if cd, err = iconv.Open("utf-8", enc); err != nil { + return + } + + defer cd.Close() + return cd.Conv(data) +} diff --git a/xmlx/entitymap.go b/xmlx/entitymap.go index e58af3d..8c86fca 100644 --- a/xmlx/entitymap.go +++ b/xmlx/entitymap.go @@ -1,11 +1,6 @@ -/* -Copyright (c) 2010, Jim Teeuwen. -All rights reserved. - -This code is subject to a 1-clause BSD license. -The contents of which can be found in the LICENSE file. -*/ - +// Copyright (c) 2010, Jim Teeuwen. All rights reserved. +// This code is subject to a 1-clause BSD license. +// The contents of which can be found in the LICENSE file. package xmlx /* @@ -20,11 +15,13 @@ package xmlx "â" (â) is not the same as "Â" (Â). */ -import "os" -import "fmt" -import "utf8" -import "regexp" -import "strconv" +import ( + "os" + "fmt" + "utf8" + "regexp" + "strconv" +) var reg_entnumeric = regexp.MustCompile("^&#[0-9]+;$") var reg_entnamed = regexp.MustCompile("^&[a-zA-Z]+;$") diff --git a/xmlx/io.go b/xmlx/io.go deleted file mode 100644 index 828e7a5..0000000 --- a/xmlx/io.go +++ /dev/null @@ -1,30 +0,0 @@ -/* -Copyright (c) 2010, Jim Teeuwen. -All rights reserved. - -This code is subject to a 1-clause BSD license. -The contents of which can be found in the LICENSE file. -*/ - -package xmlx - -import "os" -import "io" - -type ILoader interface { - LoadUrl(string) os.Error - LoadFile(string) os.Error - LoadString(string) os.Error - LoadStream(*io.Reader) os.Error -} - -type ISaver interface { - SaveFile(string) os.Error - SaveString(string) (string, os.Error) - SaveStream(*io.Writer) os.Error -} - -type ILoaderSaver interface { - ILoader - ISaver -} diff --git a/xmlx/node.go b/xmlx/node.go index 832857a..ac03319 100644 --- a/xmlx/node.go +++ b/xmlx/node.go @@ -8,12 +8,14 @@ The contents of which can be found in the LICENSE file. package xmlx -import "os" -import "strings" -import "xml" -import "bytes" -import "fmt" -import "strconv" +import ( + "os" + "strings" + "xml" + "bytes" + "fmt" + "strconv" +) const ( NT_ROOT = iota diff --git a/xmlx/test.xml b/xmlx/test.xml index 455c17e..ddd248d 100644 --- a/xmlx/test.xml +++ b/xmlx/test.xml @@ -1,5 +1,5 @@ - + WriteTheWeb diff --git a/xmlx/xmlx_test.go b/xmlx/xmlx_test.go index e7258bf..f165a14 100644 --- a/xmlx/xmlx_test.go +++ b/xmlx/xmlx_test.go @@ -14,7 +14,7 @@ func TestLoadLocal(t *testing.T) { doc := New() if err := doc.LoadFile("test.xml"); err != nil { - t.Errorf("%s", err) + t.Error(err.String()) return } @@ -24,11 +24,11 @@ func TestLoadLocal(t *testing.T) { } } -func _TestLoadRemote(t *testing.T) { +func TestLoadRemote(t *testing.T) { doc := New() - if err := doc.LoadUri("http://www.w3schools.com/xml/plant_catalog.xml"); err != nil { - t.Errorf("%s", err) + if err := doc.LoadUri("http://rss.cnn.com/rss/cnn_latest.rss"); err != nil { + t.Error(err.String()) return }