From c2601a5ce3795f384e72f82e19afcc9d28bcd46e Mon Sep 17 00:00:00 2001
From: jim teeuwen
Date: Mon, 23 Nov 2009 13:46:12 +0100
Subject: [PATCH] new file: README
 new file: src/Makefile
 new file: src/atom.go
 new file: src/author.go
 new file: src/category.go
 new file: src/channel.go
 new file: src/cloud.go
 new file: src/content.go
 new file: src/enclosure.go
 new file: src/feed.go
 new file: src/feed_test.go
 new file: src/generator.go
 new file: src/image.go
 new file: src/input.go
 new file: src/item.go
 new file: src/link.go
 new file: src/rss.go
 new file: src/source.go
 new file: src/subtitle.go

---
 README           |  23 ++++++
 src/Makefile     |  10 +++
 src/atom.go      |  97 ++++++++++++++++++++++
 src/author.go    |   7 ++
 src/category.go  |   6 ++
 src/channel.go   |  49 +++++++++++
 src/cloud.go     |  10 +++
 src/content.go   |   8 ++
 src/enclosure.go |   8 ++
 src/feed.go      | 205 +++++++++++++++++++++++++++++++++++++++++++++++
 src/feed_test.go |  22 +++++
 src/generator.go |   8 ++
 src/image.go     |  10 +++
 src/input.go     |   8 ++
 src/item.go      |  39 +++++++++
 src/link.go      |   9 +++
 src/rss.go       | 150 ++++++++++++++++++++++++++++++++++
 src/source.go    |   6 ++
 src/subtitle.go  |   7 ++
 19 files changed, 682 insertions(+)
 create mode 100644 README
 create mode 100644 src/Makefile
 create mode 100644 src/atom.go
 create mode 100644 src/author.go
 create mode 100644 src/category.go
 create mode 100644 src/channel.go
 create mode 100644 src/cloud.go
 create mode 100644 src/content.go
 create mode 100644 src/enclosure.go
 create mode 100644 src/feed.go
 create mode 100644 src/feed_test.go
 create mode 100644 src/generator.go
 create mode 100644 src/image.go
 create mode 100644 src/input.go
 create mode 100644 src/item.go
 create mode 100644 src/link.go
 create mode 100644 src/rss.go
 create mode 100644 src/source.go
 create mode 100644 src/subtitle.go

diff --git a/README b/README
new file mode 100644
index 0000000..8f004a8
--- /dev/null
+++ b/README
@@ -0,0 +1,23 @@
+ Author: jim teeuwen
+ Dependencies: go-pkg-xmlx (http://github.com/jteeuwen/go-pkg-xmlx)
+
+ This package allows us to fetch Rss and Atom feeds from the internet.
+ They are parsed into an object tree which is a hybrid of both the RSS and Atom
+ standards.
+
+ Supported feeds are:
+ - Rss v0.91, 0.92 and 2.0
+ - Atom 1.0
+
+ The package provides cache timeout management. This prevents us
+ from querying the servers for feed updates too often and risking IP bans. Apart
+ from setting a cache timeout manually, the package also optionally adheres to
+ the TTL, SkipDays and SkipHours values specified in the feeds themselves.
+
+ Note that the TTL, SkipDays and SkipHours fields are only part of the RSS spec.
+ For Atom feeds, we use the CacheTimeout in the Feed struct.
+
+ Because the object structure is a hybrid between both RSS and Atom specs, not
+ all fields will be filled when requesting either an RSS or Atom feed. I have
+ tried to create as many shared fields as possible, but some of them simply do
+ not occur in both the RSS and Atom specs.
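For orientation, here is a minimal usage sketch of the package this patch introduces; the sketch itself is not part of the patch. It assumes the package is importable as "feeder" (matching TARG=feeder in src/Makefile) and that go-pkg-xmlx is installed; the sample URL is one of those used in src/feed_test.go, and error handling is kept minimal.

package main

import "fmt"
import "feeder"

func main() {
	// Five-minute cache; also honour the TTL/SkipDays/SkipHours advertised by the feed.
	feed := feeder.New(5, true);

	// Sample URL borrowed from src/feed_test.go; any supported RSS or Atom feed works.
	if err := feed.Fetch("http://cyber.law.harvard.edu/rss/examples/rss2sample.xml"); err != nil {
		fmt.Printf("fetch failed: %s\n", err);
		return;
	}

	// Walk the hybrid object tree: channels first, then their items.
	for _, ch := range feed.Channels {
		fmt.Printf("channel: %s\n", ch.Title);
		for _, item := range ch.Items {
			fmt.Printf("  item: %s\n", item.Title);
		}

		// Only some fields are filled, depending on the feed type (see the last
		// README paragraph); the others stay at their zero values.
		switch feed.Type {
		case "rss":
			fmt.Printf("  ttl=%d skipHours=%v skipDays=%v\n", ch.TTL, ch.SkipHours, ch.SkipDays);
		case "atom":
			fmt.Printf("  id=%s rights=%s subtitle=%s\n", ch.Id, ch.Rights, ch.SubTitle.Text);
		}
	}
}

Calling Fetch again on the same Feed value within the cache window is a no-op: canUpdate in src/feed.go returns false and no HTTP request is made.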
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..17bc98a
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,10 @@
+
+include $(GOROOT)/src/Make.$(GOARCH)
+
+TARG=feeder
+GOFILES=feed.go rss.go atom.go channel.go image.go item.go cloud.go \
+	enclosure.go source.go input.go category.go generator.go link.go\
+	subtitle.go author.go content.go\
+
+
+include $(GOROOT)/src/Make.pkg
diff --git a/src/atom.go b/src/atom.go
new file mode 100644
index 0000000..abbe839
--- /dev/null
+++ b/src/atom.go
@@ -0,0 +1,97 @@
+package feeder
+
+import "os"
+import "xmlx"
+
+func (this *Feed) readAtom(doc *xmlx.Document) (err os.Error) {
+	ns := "http://www.w3.org/2005/Atom";
+	channels := doc.SelectNodes(ns, "feed");
+	for _, node := range channels {
+		ch := Channel{};
+		ch.Title = node.GetValue(ns, "title");
+		ch.LastBuildDate = node.GetValue(ns, "updated");
+		ch.Id = node.GetValue(ns, "id");
+		ch.Rights = node.GetValue(ns, "rights");
+
+		list := node.SelectNodes(ns, "link");
+		ch.Links = make([]Link, len(list));
+		for i, v := range list {
+			ch.Links[i].Href = v.GetAttr("", "href");
+			ch.Links[i].Rel = v.GetAttr("", "rel");
+			ch.Links[i].Type = v.GetAttr("", "type");
+			ch.Links[i].HrefLang = v.GetAttr("", "hreflang");
+
+		}
+
+		tn := node.SelectNode(ns, "subtitle");
+		if tn != nil {
+			ch.SubTitle = SubTitle{};
+			ch.SubTitle.Type = tn.GetAttr("", "type");
+			ch.SubTitle.Text = tn.Value;
+		}
+
+		tn = node.SelectNode(ns, "generator");
+		if tn != nil {
+			ch.Generator = Generator{};
+			ch.Generator.Uri = tn.GetAttr("", "uri");
+			ch.Generator.Version = tn.GetAttr("", "version");
+			ch.Generator.Text = tn.Value;
+		}
+
+		tn = node.SelectNode(ns, "author");
+		if tn != nil {
+			ch.Author = Author{};
+			ch.Author.Name = tn.GetValue("", "name");
+			ch.Author.Uri = tn.GetValue("", "uri");
+			ch.Author.Email = tn.GetValue("", "email");
+		}
+
+		list = node.SelectNodes(ns, "entry");
+		ch.Items = make([]Item, 0);
+		for _, v := range list {
+			item := Item{};
+			item.Title = v.GetValue(ns, "title");
+			item.Id = v.GetValue(ns, "id");
+			item.PubDate = v.GetValue(ns, "updated");
+			item.Description = v.GetValue(ns, "summary");
+
+			list = v.SelectNodes(ns, "link");
+			item.Links = make([]Link, 0);
+			for _, lv := range list {
+				if lv.GetAttr("", "rel") == "enclosure" {
+					enc := Enclosure{};
+					enc.Url = lv.GetAttr("", "href");
+					enc.Type = lv.GetAttr("", "type");
+					item.addEnclosure(enc);
+				} else {
+					lnk := Link{};
+					lnk.Href = lv.GetAttr("", "href");
+					lnk.Rel = lv.GetAttr("", "rel");
+					lnk.Type = lv.GetAttr("", "type");
+					lnk.HrefLang = lv.GetAttr("", "hreflang");
+					item.addLink(lnk);
+				}
+			}
+
+			list = v.SelectNodes(ns, "contributor");
+			item.Contributors = make([]string, len(list));
+			for ci, cv := range list {
+				item.Contributors[ci] = cv.GetValue("", "name");
+			}
+
+			tn = v.SelectNode(ns, "content");
+			if tn != nil {
+				item.Content = Content{};
+				item.Content.Type = tn.GetAttr("", "type");
+				item.Content.Lang = tn.GetValue("xml", "lang");
+				item.Content.Base = tn.GetValue("xml", "base");
+				item.Content.Text = tn.Value;
+			}
+			ch.addItem(item);
+		}
+
+		this.addChannel(ch);
+	}
+	return
+}
+
diff --git a/src/author.go b/src/author.go
new file mode 100644
index 0000000..a1aec84
--- /dev/null
+++ b/src/author.go
@@ -0,0 +1,7 @@
+package feeder
+
+type Author struct {
+	Name string;
+	Uri string;
+	Email string;
+}
diff --git a/src/category.go b/src/category.go
new file mode 100644
index 0000000..bc1b100
--- /dev/null
+++ b/src/category.go
@@ -0,0 +1,6 @@
+package feeder
+
+type Category struct {
+	Domain string;
+	Text string;
+}
diff --git a/src/channel.go b/src/channel.go
new file mode 100644
index 0000000..98ca9c5
--- /dev/null
+++ b/src/channel.go
@@ -0,0 +1,49 @@
+package feeder
+
+type Channel struct {
+	Title string;
+	Links []Link;
+	Description string;
+	Language string;
+	Copyright string;
+	ManagingEditor string;
+	WebMaster string;
+	PubDate string;
+	LastBuildDate string;
+	Docs string;
+	Categories []Category;
+	Generator Generator;
+	TTL int;
+	Rating string;
+	SkipHours []int;
+	SkipDays []int;
+	Image Image;
+	Items []Item;
+	Cloud Cloud;
+	TextInput Input;
+
+	// Atom fields
+	Id string;
+	Rights string;
+	Author Author;
+	SubTitle SubTitle;
+}
+
+func (this *Channel) addItem(item Item) {
+	slice := make([]Item, len(this.Items) + 1);
+	for i, v := range this.Items {
+		slice[i] = v;
+	}
+	slice[len(slice) - 1] = item;
+	this.Items = slice;
+}
+
+
+func (this *Channel) addLink(l Link) {
+	slice := make([]Link, len(this.Links) + 1);
+	for i, v := range this.Links {
+		slice[i] = v;
+	}
+	slice[len(slice) - 1] = l;
+	this.Links = slice;
+}
diff --git a/src/cloud.go b/src/cloud.go
new file mode 100644
index 0000000..b2f0d6e
--- /dev/null
+++ b/src/cloud.go
@@ -0,0 +1,10 @@
+package feeder
+
+type Cloud struct {
+	Domain string;
+	Port int;
+	Path string;
+	RegisterProcedure string;
+	Protocol string;
+}
+
diff --git a/src/content.go b/src/content.go
new file mode 100644
index 0000000..edd723f
--- /dev/null
+++ b/src/content.go
@@ -0,0 +1,8 @@
+package feeder
+
+type Content struct {
+	Type string;
+	Lang string;
+	Base string;
+	Text string;
+}
diff --git a/src/enclosure.go b/src/enclosure.go
new file mode 100644
index 0000000..913ebfe
--- /dev/null
+++ b/src/enclosure.go
@@ -0,0 +1,8 @@
+package feeder
+
+type Enclosure struct {
+	Url string;
+	Length int64;
+	Type string;
+}
+
diff --git a/src/feed.go b/src/feed.go
new file mode 100644
index 0000000..cc26f0e
--- /dev/null
+++ b/src/feed.go
@@ -0,0 +1,205 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+	Author: jim teeuwen
+	Dependencies: go-pkg-xmlx (http://github.com/jteeuwen/go-pkg-xmlx)
+
+	This package allows us to fetch Rss and Atom feeds from the internet.
+	They are parsed into an object tree which is a hybrid of both the RSS and Atom
+	standards.
+
+	Supported feeds are:
+	- Rss v0.91, 0.92 and 2.0
+	- Atom 1.0
+
+	The package provides cache timeout management. This prevents us
+	from querying the servers for feed updates too often and risking IP bans. Apart
+	from setting a cache timeout manually, the package also optionally adheres to
+	the TTL, SkipDays and SkipHours values specified in the feeds themselves.
+
+	Note that the TTL, SkipDays and SkipHours fields are only part of the RSS spec.
+	For Atom feeds, we use the CacheTimeout in the Feed struct.
+
+	Because the object structure is a hybrid between both RSS and Atom specs, not
+	all fields will be filled when requesting either an RSS or Atom feed. I have
+	tried to create as many shared fields as possible, but some of them simply do
+	not occur in both the RSS and Atom specs.
+*/
+package feeder
+
+import "os"
+import "http"
+import "io"
+import "time"
+import "xmlx"
+import "fmt"
+import "strconv"
+import "strings"
+
+type Feed struct {
+	// Custom cache timeout in minutes.
+	CacheTimeout int;
+
+	// Make sure we adhere to the cache timeout specified in the feed. If
+	// the feed's TTL is higher than our CacheTimeout, the TTL is used instead.
+	EnforceCacheLimit bool;
+
+	// Type of feed. Rss, Atom, etc
+	Type string;
+
+	// Version of the feed. Major and Minor.
+	Version [2]int;
+
+	// Channels with content.
+	Channels []Channel;
+
+	// Url from which this feed was created.
+	Url string;
+
+	// Last time content was fetched. Used in conjunction with CacheTimeout
+	// to ensure we don't get content too often.
+	lastupdate int64;
+}
+
+func New(cachetimeout int, enforcecachelimit bool) *Feed {
+	return &Feed{
+		CacheTimeout: cachetimeout,
+		EnforceCacheLimit: enforcecachelimit,
+		Type: "none",
+		Version: [2]int{0, 0},
+		Channels: make([]Channel, 0),
+	}
+}
+
+func (this *Feed) addChannel(ch Channel) {
+	slice := make([]Channel, len(this.Channels) + 1);
+	for i, v := range this.Channels {
+		slice[i] = v;
+	}
+	slice[len(slice) - 1] = ch;
+	this.Channels = slice;
+}
+
+func (this *Feed) Fetch(uri string) (err os.Error) {
+	if !this.canUpdate() { return }
+
+	// Fetch data from remote location.
+	r, _, err := http.Get(uri);
+	if err != nil { return }
+
+	defer r.Body.Close();
+
+	b, err := io.ReadAll(r.Body);
+	if err != nil { return }
+	content := string(b);
+
+	this.Url = uri;
+
+	// Extract type and version of the feed so we can have the appropriate
+	// function parse it (rss 0.91, rss 0.92, rss 2, atom etc).
+	doc := xmlx.New();
+	err = doc.LoadString(content);
+	if err != nil { return }
+	this.Type, this.Version = this.GetVersionInfo(doc);
+
+	ok := this.testVersions();
+	if !ok {
+		err = os.NewError(fmt.Sprintf("Unsupported feed: %s, version: %+v", this.Type, this.Version));
+		return;
+	}
+
+	err = this.buildFeed(doc);
+	if err != nil || len(this.Channels) == 0 { return }
+
+	// reset cache timeout values according to feed specified values (TTL)
+	if this.EnforceCacheLimit && this.CacheTimeout < this.Channels[0].TTL {
+		this.CacheTimeout = this.Channels[0].TTL;
+	}
+	return;
+}
+
+func (this *Feed) canUpdate() bool {
+	// Make sure we are not within the specified cache-limit.
+	// This ensures we don't request data too often.
+	utc := time.UTC();
+	if utc.Seconds() - this.lastupdate < int64(this.CacheTimeout * 60) {
+		return false
+	}
+
+	// If skipDays or skipHours are set in the RSS feed, use these to see if
+	// we can update.
+	if len(this.Channels) > 0 && this.Type == "rss" {
+		if this.EnforceCacheLimit && len(this.Channels[0].SkipDays) > 0 {
+			for _, v := range this.Channels[0].SkipDays {
+				if v == utc.Weekday {
+					return false
+				}
+			}
+		}
+
+		if this.EnforceCacheLimit && len(this.Channels[0].SkipHours) > 0 {
+			for _, v := range this.Channels[0].SkipHours {
+				if v == utc.Hour {
+					return false
+				}
+			}
+		}
+	}
+
+	this.lastupdate = utc.Seconds();
+	return true
+}
+
+func (this *Feed) buildFeed(doc *xmlx.Document) (err os.Error) {
+	switch this.Type {
+	case "rss": err = this.readRss2(doc);
+	case "atom": err = this.readAtom(doc);
+	}
+	return
+}
+
+func (this *Feed) testVersions() bool {
+	switch this.Type {
+	case "rss":
+		if this.Version[0] > 2 || (this.Version[0] == 2 && this.Version[1] > 0) {
+			return false
+		}
+
+	case "atom":
+		if this.Version[0] > 1 || (this.Version[0] == 1 && this.Version[1] > 0) {
+			return false
+		}
+
+	default:
+		return false
+	}
+
+	return true;
+}
+
+func (this *Feed) GetVersionInfo(doc *xmlx.Document) (ftype string, fversion [2]int) {
+	node := doc.SelectNode("http://www.w3.org/2005/Atom", "feed");
+	if node == nil { goto rss }
+	ftype = "atom";
+	fversion = [2]int{1, 0};
+	return;
+
+rss:
+	node = doc.SelectNode("", "rss");
+	if node == nil { goto end }
+	ftype = "rss";
+	version := node.GetAttr("", "version");
+	p := strings.Index(version, ".");
+	major, _ := strconv.Atoi(version[0:p]);
+	minor, _ := strconv.Atoi(version[p+1 : len(version)]);
+	fversion = [2]int{major, minor};
+	return;
+
+end:
+	ftype = "unknown";
+	fversion = [2]int{0, 0};
+	return;
+}
+
diff --git a/src/feed_test.go b/src/feed_test.go
new file mode 100644
index 0000000..515e4d6
--- /dev/null
+++ b/src/feed_test.go
@@ -0,0 +1,22 @@
+package feeder
+
+import "testing"
+
+func TestFeed(t *testing.T) {
+	urilist := []string{
+		"http://cyber.law.harvard.edu/rss/examples/sampleRss091.xml",
+		"http://cyber.law.harvard.edu/rss/examples/sampleRss092.xml",
+		"http://cyber.law.harvard.edu/rss/examples/rss2sample.xml",
+		"http://blog.case.edu/news/feed.atom",
+	};
+
+	for _, uri := range urilist {
+		feed := New(5, true);
+		err := feed.Fetch(uri);
+		if err != nil {
+			t.Errorf("%s >>> %s", uri, err);
+			continue;
+		}
+	}
+}
+
diff --git a/src/generator.go b/src/generator.go
new file mode 100644
index 0000000..7763c7c
--- /dev/null
+++ b/src/generator.go
@@ -0,0 +1,8 @@
+package feeder
+
+type Generator struct {
+	Uri string;
+	Version string;
+	Text string;
+}
+
diff --git a/src/image.go b/src/image.go
new file mode 100644
index 0000000..d052126
--- /dev/null
+++ b/src/image.go
@@ -0,0 +1,10 @@
+package feeder
+
+type Image struct {
+	Title string;
+	Url string;
+	Link string;
+	Width int;
+	Height int;
+	Description string;
+}
diff --git a/src/input.go b/src/input.go
new file mode 100644
index 0000000..aed1b21
--- /dev/null
+++ b/src/input.go
@@ -0,0 +1,8 @@
+package feeder
+
+type Input struct {
+	Title string;
+	Description string;
+	Name string;
+	Link string;
+}
diff --git a/src/item.go b/src/item.go
new file mode 100644
index 0000000..2f10a88
--- /dev/null
+++ b/src/item.go
@@ -0,0 +1,39 @@
+package feeder
+
+type Item struct {
+	// RSS and Shared fields
+	Title string;
+	Links []Link;
+	Description string;
+	Author Author;
+	Categories []Category;
+	Comments string;
+	Enclosures []Enclosure;
+	Guid string;
+	PubDate string;
+	Source Source;
+
+	// Atom specific fields
+	Id string;
+	Generator Generator;
+	Contributors []string;
+	Content Content;
+}
+
+func (this *Item) addEnclosure(e Enclosure) {
+	slice := make([]Enclosure, len(this.Enclosures) + 1);
+	for i, v := range this.Enclosures {
+		slice[i] = v;
+	}
+	slice[len(slice) - 1] = e;
+	this.Enclosures = slice;
+}
+
+func (this *Item) addLink(l Link) {
+	slice := make([]Link, len(this.Links) + 1);
+	for i, v := range this.Links {
+		slice[i] = v;
+	}
+	slice[len(slice) - 1] = l;
+	this.Links = slice;
+}
diff --git a/src/link.go b/src/link.go
new file mode 100644
index 0000000..32a3eb1
--- /dev/null
+++ b/src/link.go
@@ -0,0 +1,9 @@
+package feeder
+
+type Link struct {
+	Href string;
+	Rel string;
+	Type string;
+	HrefLang string;
+}
+
diff --git a/src/rss.go b/src/rss.go
new file mode 100644
index 0000000..ab5d228
--- /dev/null
+++ b/src/rss.go
@@ -0,0 +1,150 @@
+package feeder
+
+import "os"
+import "xmlx"
+
+func (this *Feed) readRss2(doc *xmlx.Document) (err os.Error) {
+	channels := doc.SelectNodes("", "channel");
+	for _, node := range channels {
+		ch := Channel{};
+		ch.Title = node.GetValue("", "title");
+
+		list := node.SelectNodes("", "link");
+		ch.Links = make([]Link, len(list));
+		for i, v := range list {
+			ch.Links[i].Href = v.Value;
+		}
+
+		ch.Description = node.GetValue("", "description");
+		ch.Language = node.GetValue("", "language");
+		ch.Copyright = node.GetValue("", "copyright");
+		ch.ManagingEditor = node.GetValue("", "managingEditor");
+		ch.WebMaster = node.GetValue("", "webMaster");
+		ch.PubDate = node.GetValue("", "pubDate");
+		ch.LastBuildDate = node.GetValue("", "lastBuildDate");
+		ch.Docs = node.GetValue("", "docs");
+
+		list = node.SelectNodes("", "category");
+		ch.Categories = make([]Category, len(list));
+		for i, v := range list {
+			ch.Categories[i].Domain = v.GetAttr("", "domain");
+			ch.Categories[i].Text = v.Value;
+		}
+
+		n := node.SelectNode("", "generator");
+		if n != nil {
+			ch.Generator = Generator{};
+			ch.Generator.Text = n.Value;
+		}
+
+		ch.TTL = node.GetValuei("", "ttl");
+		ch.Rating = node.GetValue("", "rating");
+
+		list = node.SelectNodes("", "hour");
+		ch.SkipHours = make([]int, len(list));
+		for i, v := range list {
+			ch.SkipHours[i] = int(v.GetValuei("", "hour"));
+		}
+
+		list = node.SelectNodes("", "days");
+		ch.SkipDays = make([]int, len(list));
+		for i, v := range list {
+			ch.SkipDays[i] = mapDay(v.Value);
+		}
+
+		n = node.SelectNode("", "image");
+		if n != nil {
+			ch.Image.Title = n.GetValue("", "title");
+			ch.Image.Url = n.GetValue("", "url");
+			ch.Image.Link = n.GetValue("", "link");
+			ch.Image.Width = n.GetValuei("", "width");
+			ch.Image.Height = n.GetValuei("", "height");
+			ch.Image.Description = n.GetValue("", "description");
+		}
+
+		n = node.SelectNode("", "cloud");
+		if n != nil {
+			ch.Cloud = Cloud{};
+			ch.Cloud.Domain = n.GetAttr("", "domain");
+			ch.Cloud.Port = n.GetAttri("", "port");
+			ch.Cloud.Path = n.GetAttr("", "path");
+			ch.Cloud.RegisterProcedure = n.GetAttr("", "registerProcedure");
+			ch.Cloud.Protocol = n.GetAttr("", "protocol");
+		}
+
+		n = node.SelectNode("", "textInput");
+		if n != nil {
+			ch.TextInput = Input{};
+			ch.TextInput.Title = n.GetValue("", "title");
+			ch.TextInput.Description = n.GetValue("", "description");
+			ch.TextInput.Name = n.GetValue("", "name");
+			ch.TextInput.Link = n.GetValue("", "link");
+		}
+
+		list = node.SelectNodes("", "item");
+		for _, item := range list {
+			i := Item{};
+			i.Title = item.GetValue("", "title");
+			i.Description = item.GetValue("", "description");
+
+			list = item.SelectNodes("", "link");
+			i.Links = make([]Link, 0);
+			for _, v := range list {
+				lnk := Link{};
+				lnk.Href = v.Value;
+				i.addLink(lnk);
+			}
+
+			n = item.SelectNode("", "author");
+			if n != nil {
+				i.Author = Author{};
+				i.Author.Name = n.Value;
+			}
+
+			i.Comments = item.GetValue("", "comments");
+			i.Guid = item.GetValue("", "guid");
+			i.PubDate = item.GetValue("", "pubDate");
+
+			list := item.SelectNodes("", "category");
+			i.Categories = make([]Category, len(list));
+			for li, lv := range list {
+				i.Categories[li].Domain = lv.GetAttr("", "domain");
+				i.Categories[li].Text = lv.Value;
+			}
+
+			list = item.SelectNodes("", "enclosure");
+			i.Enclosures = make([]Enclosure, len(list));
+			for li, lv := range list {
+				i.Enclosures[li].Url = lv.GetAttr("", "url");
+				i.Enclosures[li].Length = lv.GetAttri64("", "length");
+				i.Enclosures[li].Type = lv.GetAttr("", "type");
+			}
+
+			src := item.SelectNode("", "source");
+			if src != nil {
+				i.Source = Source{};
+				i.Source.Url = src.GetAttr("", "url");
+				i.Source.Text = src.Value;
+			}
+
+			ch.addItem(i);
+		}
+
+		this.addChannel(ch);
+	}
+	return
+}
+
+func mapDay(day string) int {
+	switch day {
+	case "Monday": return 1;
+	case "Tuesday": return 2;
+	case "Wednesday": return 3;
+	case "Thursday": return 4;
+	case "Friday": return 5;
+	case "Saturday": return 6;
+	case "Sunday": return 7;
+	}
+	return 1;
+}
+
diff --git a/src/source.go b/src/source.go
new file mode 100644
index 0000000..1dc45cb
--- /dev/null
+++ b/src/source.go
@@ -0,0 +1,6 @@
+package feeder
+
+type Source struct {
+	Url string;
+	Text string;
+}
diff --git a/src/subtitle.go b/src/subtitle.go
new file mode 100644
index 0000000..5fa9ef9
--- /dev/null
+++ b/src/subtitle.go
@@ -0,0 +1,7 @@
+package feeder
+
+type SubTitle struct {
+	Type string;
+	Text string;
+}
+