Time parsing.

Rather than just using a string for PubDate, we attempt to parse it.
This includes a couple of crazy non-standard time formats that I've seen
in the wild.

Breaking change: Item.PubDate is no longer a string, it is time.Time.
This commit is contained in:
Sean Schulte 2014-03-24 21:54:15 -05:00
parent 2b6dc03ede
commit 2c67b94a04
5 changed files with 135 additions and 5 deletions

View File

@ -56,7 +56,7 @@ func (this *Feed) readAtom(doc *xmlx.Document) (err error) {
i = new(Item) i = new(Item)
i.Title = item.S(ns, "title") i.Title = item.S(ns, "title")
i.Id = item.S(ns, "id") i.Id = item.S(ns, "id")
i.PubDate = item.S(ns, "updated") i.PubDate, _ = parseTime(item.S(ns, "updated"))
i.Description = item.S(ns, "summary") i.Description = item.S(ns, "summary")
links := item.SelectNodes(ns, "link") links := item.SelectNodes(ns, "link")

View File

@ -3,6 +3,7 @@ package feeder
import ( import (
"crypto/md5" "crypto/md5"
"io" "io"
"time"
) )
type Item struct { type Item struct {
@ -15,7 +16,7 @@ type Item struct {
Comments string Comments string
Enclosures []*Enclosure Enclosures []*Enclosure
Guid *string Guid *string
PubDate string PubDate time.Time
Source *Source Source *Source
// Atom specific fields // Atom specific fields
@ -33,8 +34,8 @@ func (i *Item) Key() string {
return *i.Guid return *i.Guid
case len(i.Id) != 0: case len(i.Id) != 0:
return i.Id return i.Id
case len(i.Title) > 0 && len(i.PubDate) > 0: case len(i.Title) > 0 && !i.PubDate.IsZero():
return i.Title + i.PubDate return i.Title + i.PubDate.String()
default: default:
h := md5.New() h := md5.New()
io.WriteString(h, i.Description) io.WriteString(h, i.Description)

2
rss.go
View File

@ -162,7 +162,7 @@ func (this *Feed) readRss2(doc *xmlx.Document) (err error) {
i.Guid = &guid i.Guid = &guid
} }
i.PubDate = item.S(ns, "pubDate") i.PubDate, _ = parseTime(item.S(ns, "pubDate"))
tl = item.SelectNodes(ns, "category") tl = item.SelectNodes(ns, "category")
for _, lv := range tl { for _, lv := range tl {

35
timeparser.go Normal file
View File

@ -0,0 +1,35 @@
package feeder
import (
"strings"
"time"
)
// parseTime converts a feed timestamp into a time.Time by trying a fixed list
// of layouts in order and returning the result of the first layout that parses
// without error.
//
// Leading and trailing whitespace in formatted is ignored. If no layout
// matches, the zero time.Time is returned together with the error produced by
// the last layout that was tried.
func parseTime(formatted string) (time.Time, error) {
	// Order matters: the first layout that parses successfully wins.
	layouts := [...]string{
		"Mon, _2 Jan 2006 15:04:05 MST",
		"Mon, _2 Jan 2006 15:04:05 -0700",
		time.ANSIC,
		time.UnixDate,
		time.RubyDate,
		time.RFC822,
		time.RFC822Z,
		time.RFC850,
		time.RFC1123,
		time.RFC1123Z,
		time.RFC3339,
		time.RFC3339Nano,
		// Non-standard layouts: comma after the day and no seconds or zone,
		// and a date without a leading weekday.
		"Mon, 2, Jan 2006 15:4",
		"02 Jan 2006 15:04:05 MST",
	}

	formatted = strings.TrimSpace(formatted)

	var err error
	for _, layout := range layouts {
		var t time.Time
		// Success is decided by the error, not by the parsed value: a
		// timestamp can legitimately parse to the zero instant, and treating
		// that as a failure would report an unrelated error from a later layout.
		if t, err = time.Parse(layout, formatted); err == nil {
			return t, nil
		}
	}
	return time.Time{}, err
}

94
timeparser_test.go Normal file
View File

@ -0,0 +1,94 @@
package feeder
import (
"time"
"testing"
)
// Test_InvalidDate checks that input matching none of the known layouts
// yields the zero time together with a non-nil error.
func Test_InvalidDate(t *testing.T) {
	parsed, parseErr := parseTime("invalid")
	if isZero := parsed.IsZero(); !isZero {
		t.Errorf("Invalid date should parse to zero")
	}
	if parseErr == nil {
		t.Errorf("error should not be nil")
	}
}
// Test_ParseLayout0 parses an RFC 3339 style timestamp that carries a
// numeric -05:00 offset.
func Test_ParseLayout0(t *testing.T) {
	want := time.Date(2014, time.March, 7, 5, 38, 0, 0, time.FixedZone("-0500", -18000))
	got, parseErr := parseTime("2014-03-07T05:38:00-05:00")
	assertEqualTime(t, want, got)
	if parseErr != nil {
		t.Errorf("err should be nil")
	}
}
// Test_ParseLayout1 parses a weekday-prefixed timestamp that ends in the
// "GMT" zone abbreviation and expects the equivalent UTC instant.
func Test_ParseLayout1(t *testing.T) {
	want := time.Date(2014, time.March, 7, 17, 42, 51, 0, time.UTC)
	got, parseErr := parseTime("Fri, 07 Mar 2014 17:42:51 GMT")
	assertEqualTime(t, want, got)
	if parseErr != nil {
		t.Errorf("err should be nil")
	}
}
// Test_ParseLayout2 parses an RFC 3339 style timestamp whose zone is given
// as "Z" and expects the corresponding UTC instant.
func Test_ParseLayout2(t *testing.T) {
	want := time.Date(2014, time.February, 5, 23, 33, 34, 0, time.UTC)
	got, parseErr := parseTime("2014-02-05T23:33:34Z")
	assertEqualTime(t, want, got)
	if parseErr != nil {
		t.Errorf("err should be nil")
	}
}
// Test_ParseLayout3 parses a weekday-prefixed timestamp with a numeric
// "+0000" offset and expects the corresponding UTC instant.
func Test_ParseLayout3(t *testing.T) {
	want := time.Date(2014, time.March, 3, 2, 12, 25, 0, time.UTC)
	got, parseErr := parseTime("Mon, 03 Mar 2014 02:12:25 +0000")
	assertEqualTime(t, want, got)
	if parseErr != nil {
		t.Errorf("err should be nil")
	}
}
// Test_ParseLayout4 parses the non-standard form with a comma after the day
// and no seconds or zone ("Fri, 21, Mar 2014 10:41"), expecting a UTC time.
func Test_ParseLayout4(t *testing.T) {
	want := time.Date(2014, time.March, 21, 10, 41, 0, 0, time.UTC)
	got, parseErr := parseTime("Fri, 21, Mar 2014 10:41")
	assertEqualTime(t, want, got)
	if parseErr != nil {
		t.Errorf("err should be nil")
	}
}
// Test_ParseLayout4_1 covers the same non-standard form as Test_ParseLayout4
// with a single-digit minute ("11:1").
func Test_ParseLayout4_1(t *testing.T) {
	want := time.Date(2014, time.January, 17, 11, 1, 0, 0, time.UTC)
	got, parseErr := parseTime("Fri, 17, Jan 2014 11:1")
	assertEqualTime(t, want, got)
	if parseErr != nil {
		t.Errorf("err should be nil")
	}
}
// Test_ParseLayout4_2 covers the same non-standard form as Test_ParseLayout4
// with a single-digit day of month ("9,").
func Test_ParseLayout4_2(t *testing.T) {
	want := time.Date(2014, time.January, 9, 10, 19, 0, 0, time.UTC)
	got, parseErr := parseTime("Thu, 9, Jan 2014 10:19")
	assertEqualTime(t, want, got)
	if parseErr != nil {
		t.Errorf("err should be nil")
	}
}
// Test_ParseLayout5 parses a date without a leading weekday that ends in a
// zone abbreviation ("22 Jul 2013 14:55:01 EST").
//
// time.Parse only maps an abbreviation such as EST to a real offset when the
// current local zone defines it; otherwise it records a zero offset. Without
// pinning time.Local, the -05:00 expectation below would pass or fail
// depending on the TZ of the machine running the test.
func Test_ParseLayout5(t *testing.T) {
	// Make EST resolvable to -05:00 for the duration of this test only.
	savedLocal := time.Local
	time.Local = time.FixedZone("EST", -18000)
	defer func() { time.Local = savedLocal }()

	date, err := parseTime("22 Jul 2013 14:55:01 EST")
	expected := time.Date(2013, time.July, 22, 14, 55, 1, 0, time.FixedZone("EST", -18000))
	assertEqualTime(t, expected, date)
	if err != nil {
		t.Errorf("err should be nil")
	}
}
// assertEqualTime records a test error when expected and actual are not equal
// according to time.Time.Equal; it does not stop the test.
func assertEqualTime(t *testing.T, expected, actual time.Time) {
	if expected.Equal(actual) {
		return
	}
	t.Errorf("expected %v but was %v", expected, actual)
}