Fix issue 1: Added dependency on go-iconv. This lib is needed to ensure we pass valid UTF-8 encoded data to the XML tokenizer.

This commit is contained in:
jim teeuwen 2011-01-18 21:31:56 +01:00
parent bacbff0e71
commit 02d19ed0bd
9 changed files with 101 additions and 76 deletions

View File

@ -1,5 +1,5 @@
Copyright (c) 2010, Jim Teeuwen.
Copyright (c) 2010-2011, Jim Teeuwen.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,

6
README
View File

@ -8,6 +8,12 @@
Nodes now simply become collections and don't require you to read them in the
order in which the xml.Parser finds them.
================================================================================
DEPENDENCIES
================================================================================
go-iconv: https://github.com/sloonz/go-iconv
================================================================================
USAGE
================================================================================

View File

@ -1,6 +1,6 @@
include $(GOROOT)/src/Make.inc
TARG = xmlx
GOFILES = document.go node.go io.go entitymap.go\
GOFILES = document.go node.go entitymap.go
include $(GOROOT)/src/Make.pkg

View File

@ -1,41 +1,40 @@
// Copyright (c) 2010, Jim Teeuwen. All rights reserved.
// This code is subject to a 1-clause BSD license.
// The contents of which can be found in the LICENSE file.
/*
Copyright (c) 2010, Jim Teeuwen.
All rights reserved.
This code is subject to a 1-clause BSD license.
The contents of which can be found in the LICENSE file.
This package wraps the standard XML library and uses it to build a node tree of
any document you load. This allows you to look up nodes forwards and backwards,
as well as perform search queries (no xpath support yet).
as well as perform simple search queries.
Nodes now simply become collections and don't require you to read them in the
order in which the xml.Parser finds them.
The Document currently implements 2 simple search functions which allow you to
The Document currently implements 2 search functions which allow you to
look for specific nodes.
Document.SelectNode(namespace, name string) *Node;
Document.SelectNodes(namespace, name string) []*Node;
*xmlx.Document.SelectNode(namespace, name string) *Node;
*xmlx.Document.SelectNodes(namespace, name string) []*Node;
SelectNode() returns the first, single node it finds matching the given name
and namespace. SelectNodes() returns a slice containing all the matching nodes.
Note that these search functions can be invoked on individual nodes as well.
This allows you to search only a subset of the entire document.
*/
package xmlx
import "os"
import "io"
import "io/ioutil"
import "path"
import "strings"
import "xml"
import "fmt"
import "http"
import (
"os"
"io"
"io/ioutil"
"path"
"strings"
"xml"
"fmt"
"http"
"iconv"
)
type Document struct {
Version string
@ -82,6 +81,12 @@ func (this *Document) SelectNodes(namespace, name string) []*Node {
// *** Satisfy ILoader interface
// *****************************************************************************
func (this *Document) LoadString(s string) (err os.Error) {
// Ensure we are passing UTF-8 encoding content to the XML tokenizer.
if s, err = this.correctEncoding(s); err != nil {
return
}
// tokenize data
xp := xml.NewParser(strings.NewReader(s))
xp.Entity = this.Entity
@ -218,3 +223,48 @@ func (this *Document) SaveStream(w io.Writer) (err os.Error) {
_, err = w.Write([]byte(s))
return
}
// declaredEncoding extracts the value of the encoding pseudo-attribute from the
// body of an XML declaration (the text between "<?xml" and "?>"). The result is
// lower-cased. It returns "" when no well-formed encoding attribute is present.
// Both single- and double-quoted values are accepted, as is whitespace around
// the '=' sign.
func declaredEncoding(inst string) string {
	inst = strings.ToLower(inst)

	i := strings.Index(inst, "encoding")
	if i < 0 {
		return ""
	}

	rest := strings.TrimSpace(inst[i+len("encoding"):])
	if len(rest) == 0 || rest[0] != '=' {
		return ""
	}

	rest = strings.TrimSpace(rest[1:])
	if len(rest) == 0 || (rest[0] != '"' && rest[0] != '\'') {
		return ""
	}

	// The closing quote must be the same character as the opening one.
	quote := rest[0:1]
	rest = rest[1:]

	j := strings.Index(rest, quote)
	if j < 0 {
		// Unterminated value: treat as "not declared" instead of slicing
		// with a negative index.
		return ""
	}

	return rest[:j]
}

// correctEncoding uses libiconv to ensure we get UTF-8 encoded data. The Go XML
// tokenizer will throw a tantrum if we give it anything else.
//
// It scans the leading tokens of data for an XML declaration and reads its
// encoding attribute. If none is declared, or the declared encoding is
// "utf-8", data is returned unchanged. Otherwise data is converted from the
// declared encoding to UTF-8 and the converted string is returned.
//
// A non-nil error is returned when the tokenizer fails before a declaration or
// the root element is reached, when iconv cannot open a converter for the
// declared encoding, or when the conversion itself fails.
func (this *Document) correctEncoding(data string) (ret string, err os.Error) {
	var cd *iconv.Iconv
	var tok xml.Token

	enc := "utf-8"
	xp := xml.NewParser(strings.NewReader(data))
	xp.Entity = this.Entity

loop:
	for {
		if tok, err = xp.Token(); err != nil {
			if err == os.EOF {
				break loop
			}
			return "", err
		}

		switch tt := tok.(type) {
		case xml.ProcInst:
			if tt.Target == "xml" { // xml declaration
				// Only override the default when an encoding is actually
				// declared; a declaration carrying just a version must not
				// end up as the encoding name.
				if e := declaredEncoding(string(tt.Inst)); e != "" {
					enc = e
				}
				break loop
			}
		case xml.StartElement:
			// The XML declaration can only precede the root element, so
			// there is no point in tokenizing the rest of the document.
			break loop
		}
	}

	if enc == "utf-8" {
		return data, nil
	}

	if cd, err = iconv.Open("utf-8", enc); err != nil {
		return
	}

	defer cd.Close()
	return cd.Conv(data)
}

View File

@ -1,11 +1,6 @@
/*
Copyright (c) 2010, Jim Teeuwen.
All rights reserved.
This code is subject to a 1-clause BSD license.
The contents of which can be found in the LICENSE file.
*/
// Copyright (c) 2010, Jim Teeuwen. All rights reserved.
// This code is subject to a 1-clause BSD license.
// The contents of which can be found in the LICENSE file.
package xmlx
/*
@ -20,11 +15,13 @@ package xmlx
"â" (â) is not the same as "Â" (Â).
*/
import "os"
import "fmt"
import "utf8"
import "regexp"
import "strconv"
import (
"os"
"fmt"
"utf8"
"regexp"
"strconv"
)
var reg_entnumeric = regexp.MustCompile("^&#[0-9]+;$")
var reg_entnamed = regexp.MustCompile("^&[a-zA-Z]+;$")

View File

@ -1,30 +0,0 @@
/*
Copyright (c) 2010, Jim Teeuwen.
All rights reserved.
This code is subject to a 1-clause BSD license.
The contents of which can be found in the LICENSE file.
*/
package xmlx
import "os"
import "io"
// ILoader describes a type that can load its content from a URL, a file,
// a string or a stream. Every method reports failure through an os.Error.
//
// NOTE(review): the tests in this package call LoadUri, not LoadUrl, and
// LoadStream takes a pointer to an io.Reader interface value rather than an
// io.Reader -- confirm whether any type actually satisfies this interface.
type ILoader interface {
LoadUrl(string) os.Error
LoadFile(string) os.Error
LoadString(string) os.Error
LoadStream(*io.Reader) os.Error
}
// ISaver describes a type that can save its content to a file, a string or
// a stream. Every method reports failure through an os.Error.
//
// NOTE(review): Document.SaveStream is declared with an io.Writer parameter,
// whereas this interface asks for *io.Writer -- the signatures do not match.
type ISaver interface {
SaveFile(string) os.Error
SaveString(string) (string, os.Error)
SaveStream(*io.Writer) os.Error
}
// ILoaderSaver combines ILoader and ISaver by embedding both interfaces.
type ILoaderSaver interface {
ILoader
ISaver
}

View File

@ -8,12 +8,14 @@ The contents of which can be found in the LICENSE file.
package xmlx
import "os"
import "strings"
import "xml"
import "bytes"
import "fmt"
import "strconv"
import (
"os"
"strings"
"xml"
"bytes"
"fmt"
"strconv"
)
const (
NT_ROOT = iota

View File

@ -1,5 +1,5 @@
<!DOCTYPE xml>
<?xml version="1.0" encoding="ISO-8859-1"?>
<?xml version="1.0" encoding="utf-8"?>
<rss version="0.91">
<channel>
<title>WriteTheWeb</title>

View File

@ -14,7 +14,7 @@ func TestLoadLocal(t *testing.T) {
doc := New()
if err := doc.LoadFile("test.xml"); err != nil {
t.Errorf("%s", err)
t.Error(err.String())
return
}
@ -24,11 +24,11 @@ func TestLoadLocal(t *testing.T) {
}
}
func _TestLoadRemote(t *testing.T) {
func TestLoadRemote(t *testing.T) {
doc := New()
if err := doc.LoadUri("http://www.w3schools.com/xml/plant_catalog.xml"); err != nil {
t.Errorf("%s", err)
if err := doc.LoadUri("http://rss.cnn.com/rss/cnn_latest.rss"); err != nil {
t.Error(err.String())
return
}