Fix issue 1: Added dependency on go-iconv. This library is needed to ensure we pass valid UTF-8 encoded data to the XML tokenizer.

This commit is contained in:
jim teeuwen 2011-01-18 21:31:56 +01:00
parent bacbff0e71
commit 02d19ed0bd
9 changed files with 101 additions and 76 deletions

View File

@ -1,5 +1,5 @@
Copyright (c) 2010, Jim Teeuwen. Copyright (c) 2010-2011, Jim Teeuwen.
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, Redistribution and use in source and binary forms, with or without modification,

6
README
View File

@ -8,6 +8,12 @@
Nodes now simply become collections and don't require you to read them in the Nodes now simply become collections and don't require you to read them in the
order in which the xml.Parser finds them. order in which the xml.Parser finds them.
================================================================================
DEPENDENCIES
================================================================================
go-iconv: https://github.com/sloonz/go-iconv
================================================================================ ================================================================================
USAGE USAGE
================================================================================ ================================================================================

View File

@ -1,6 +1,6 @@
include $(GOROOT)/src/Make.inc include $(GOROOT)/src/Make.inc
TARG = xmlx TARG = xmlx
GOFILES = document.go node.go io.go entitymap.go\ GOFILES = document.go node.go entitymap.go
include $(GOROOT)/src/Make.pkg include $(GOROOT)/src/Make.pkg

View File

@ -1,41 +1,40 @@
// Copyright (c) 2010, Jim Teeuwen. All rights reserved.
// This code is subject to a 1-clause BSD license.
// The contents of which can be found in the LICENSE file.
/* /*
Copyright (c) 2010, Jim Teeuwen.
All rights reserved.
This code is subject to a 1-clause BSD license.
The contents of which can be found in the LICENSE file.
This package wraps the standard XML library and uses it to build a node tree of This package wraps the standard XML library and uses it to build a node tree of
any document you load. This allows you to look up nodes forwards and backwards, any document you load. This allows you to look up nodes forwards and backwards,
as well as perform search queries (no xpath support yet). as well as perform simple search queries.
Nodes now simply become collections and don't require you to read them in the Nodes now simply become collections and don't require you to read them in the
order in which the xml.Parser finds them. order in which the xml.Parser finds them.
The Document currently implements 2 simple search functions which allow you to The Document currently implements 2 search functions which allow you to
look for specific nodes. look for specific nodes.
Document.SelectNode(namespace, name string) *Node; *xmlx.Document.SelectNode(namespace, name string) *Node;
Document.SelectNodes(namespace, name string) []*Node; *xmlx.Document.SelectNodes(namespace, name string) []*Node;
SelectNode() returns the first, single node it finds matching the given name SelectNode() returns the first, single node it finds matching the given name
and namespace. SelectNodes() returns a slice containing all the matching nodes. and namespace. SelectNodes() returns a slice containing all the matching nodes.
Note that these search functions can be invoked on individual nodes as well. Note that these search functions can be invoked on individual nodes as well.
This allows you to search only a subset of the entire document. This allows you to search only a subset of the entire document.
*/ */
package xmlx package xmlx
import "os" import (
import "io" "os"
import "io/ioutil" "io"
import "path" "io/ioutil"
import "strings" "path"
import "xml" "strings"
import "fmt" "xml"
import "http" "fmt"
"http"
"iconv"
)
type Document struct { type Document struct {
Version string Version string
@ -82,6 +81,12 @@ func (this *Document) SelectNodes(namespace, name string) []*Node {
// *** Satisfy ILoader interface // *** Satisfy ILoader interface
// ***************************************************************************** // *****************************************************************************
func (this *Document) LoadString(s string) (err os.Error) { func (this *Document) LoadString(s string) (err os.Error) {
// Ensure we are passing UTF-8 encoded content to the XML tokenizer.
if s, err = this.correctEncoding(s); err != nil {
return
}
// tokenize data
xp := xml.NewParser(strings.NewReader(s)) xp := xml.NewParser(strings.NewReader(s))
xp.Entity = this.Entity xp.Entity = this.Entity
@ -218,3 +223,48 @@ func (this *Document) SaveStream(w io.Writer) (err os.Error) {
_, err = w.Write([]byte(s)) _, err = w.Write([]byte(s))
return return
} }
// correctEncoding uses libiconv to ensure we hand UTF-8 encoded data to the
// Go XML tokenizer, which rejects anything else.
//
// It tokenizes data until it reaches the XML declaration (<?xml ... ?>) and
// reads the encoding pseudo-attribute from it. When no encoding is declared,
// or the declared encoding is utf-8, data is returned unchanged. Otherwise
// data is converted from the declared encoding to UTF-8.
//
// A tokenizer error other than os.EOF, a failure to open the iconv converter
// or a conversion failure is returned as err, with ret left empty for the
// first two.
func (this *Document) correctEncoding(data string) (ret string, err os.Error) {
	var cd *iconv.Iconv
	var tok xml.Token

	// Default used when the document does not declare an encoding.
	enc := "utf-8"

	xp := xml.NewParser(strings.NewReader(data))
	xp.Entity = this.Entity

loop:
	for {
		if tok, err = xp.Token(); err != nil {
			if err == os.EOF {
				break loop
			}
			return "", err
		}

		switch tt := tok.(type) {
		case xml.ProcInst:
			if tt.Target != "xml" {
				continue
			}

			// Only replace the default once an encoding value has actually
			// been extracted; a declaration such as <?xml version="1.0"?>
			// must not leave its whole body in enc.
			if declared := xmlDeclEncoding(strings.ToLower(string(tt.Inst))); declared != "" {
				enc = declared
				break loop
			}

		case xml.StartElement:
			// The XML declaration can only precede the root element, so
			// there is no point in tokenizing the rest of the document.
			break loop
		}
	}

	if enc == "utf-8" {
		return data, nil
	}

	if cd, err = iconv.Open("utf-8", enc); err != nil {
		return "", err
	}

	defer cd.Close()
	return cd.Conv(data)
}

// xmlDeclEncoding extracts the value of the encoding pseudo-attribute from the
// body of an XML declaration. It accepts single or double quotes and optional
// whitespace around '='. It returns "" when no well-formed encoding value is
// present (missing attribute, missing '=', missing or unterminated quote).
func xmlDeclEncoding(inst string) string {
	const key = "encoding"

	i := strings.Index(inst, key)
	if i < 0 {
		return ""
	}

	rest := strings.TrimSpace(inst[i+len(key):])
	if len(rest) == 0 || rest[0] != '=' {
		return ""
	}

	rest = strings.TrimSpace(rest[1:])
	if len(rest) == 0 || (rest[0] != '"' && rest[0] != '\'') {
		return ""
	}

	// The closing quote must be the same character as the opening one.
	quote := rest[:1]
	rest = rest[1:]

	end := strings.Index(rest, quote)
	if end < 0 {
		return ""
	}
	return rest[:end]
}

View File

@ -1,11 +1,6 @@
/* // Copyright (c) 2010, Jim Teeuwen. All rights reserved.
Copyright (c) 2010, Jim Teeuwen. // This code is subject to a 1-clause BSD license.
All rights reserved. // The contents of which can be found in the LICENSE file.
This code is subject to a 1-clause BSD license.
The contents of which can be found in the LICENSE file.
*/
package xmlx package xmlx
/* /*
@ -20,11 +15,13 @@ package xmlx
"â" (â) is not the same as "Â" (Â). "â" (â) is not the same as "Â" (Â).
*/ */
import "os" import (
import "fmt" "os"
import "utf8" "fmt"
import "regexp" "utf8"
import "strconv" "regexp"
"strconv"
)
var reg_entnumeric = regexp.MustCompile("^&#[0-9]+;$") var reg_entnumeric = regexp.MustCompile("^&#[0-9]+;$")
var reg_entnamed = regexp.MustCompile("^&[a-zA-Z]+;$") var reg_entnamed = regexp.MustCompile("^&[a-zA-Z]+;$")

View File

@ -1,30 +0,0 @@
/*
Copyright (c) 2010, Jim Teeuwen.
All rights reserved.
This code is subject to a 1-clause BSD license.
The contents of which can be found in the LICENSE file.
*/
package xmlx
import "os"
import "io"
// ILoader lists four loading methods. LoadUrl, LoadFile and LoadString each
// take a single string and LoadStream takes a pointer to an io.Reader; every
// method returns an os.Error.
// NOTE(review): presumably the string arguments are a URL, a file path and raw
// XML text respectively -- confirm against the implementing type.
// NOTE(review): the package tests call doc.LoadUri, not LoadUrl -- verify that
// any type meant to satisfy this interface really has a LoadUrl method.
type ILoader interface {
LoadUrl(string) os.Error
LoadFile(string) os.Error
LoadString(string) os.Error
LoadStream(*io.Reader) os.Error
}
// ISaver lists three saving methods. SaveFile takes a string and returns an
// os.Error, SaveString takes a string and returns a string plus an os.Error,
// and SaveStream takes a pointer to an io.Writer and returns an os.Error.
// NOTE(review): Document.SaveStream is declared with an io.Writer value
// parameter, not *io.Writer, so it does not match the SaveStream signature
// declared here -- confirm which form is intended.
type ISaver interface {
SaveFile(string) os.Error
SaveString(string) (string, os.Error)
SaveStream(*io.Writer) os.Error
}
// ILoaderSaver embeds ILoader and ISaver, so a type must provide the methods
// of both interfaces to satisfy it.
type ILoaderSaver interface {
ILoader
ISaver
}

View File

@ -8,12 +8,14 @@ The contents of which can be found in the LICENSE file.
package xmlx package xmlx
import "os" import (
import "strings" "os"
import "xml" "strings"
import "bytes" "xml"
import "fmt" "bytes"
import "strconv" "fmt"
"strconv"
)
const ( const (
NT_ROOT = iota NT_ROOT = iota

View File

@ -1,5 +1,5 @@
<!DOCTYPE xml> <!DOCTYPE xml>
<?xml version="1.0" encoding="ISO-8859-1"?> <?xml version="1.0" encoding="utf-8"?>
<rss version="0.91"> <rss version="0.91">
<channel> <channel>
<title>WriteTheWeb</title> <title>WriteTheWeb</title>

View File

@ -14,7 +14,7 @@ func TestLoadLocal(t *testing.T) {
doc := New() doc := New()
if err := doc.LoadFile("test.xml"); err != nil { if err := doc.LoadFile("test.xml"); err != nil {
t.Errorf("%s", err) t.Error(err.String())
return return
} }
@ -24,11 +24,11 @@ func TestLoadLocal(t *testing.T) {
} }
} }
func _TestLoadRemote(t *testing.T) { func TestLoadRemote(t *testing.T) {
doc := New() doc := New()
if err := doc.LoadUri("http://www.w3schools.com/xml/plant_catalog.xml"); err != nil { if err := doc.LoadUri("http://rss.cnn.com/rss/cnn_latest.rss"); err != nil {
t.Errorf("%s", err) t.Error(err.String())
return return
} }