diff --git a/README b/README
index 6c895d6..0eb549e 100644
--- a/README
+++ b/README
@@ -1,26 +1,33 @@
- Author: jim teeuwen
- Dependencies: go-pkg-xmlx ( http://github.com/jteeuwen/go-pkg-xmlx )
+Author: jim teeuwen
+Dependencies: go-pkg-xmlx ( http://github.com/jteeuwen/go-pkg-xmlx )

- This package allows us to fetch Rss and Atom feeds from the internet.
- They are parsed into an object tree which is a hybrid of both the RSS and Atom
- standards.
+This package allows us to fetch Rss and Atom feeds from the internet.
+They are parsed into an object tree which is a hybrid of both the RSS and Atom
+standards.

- Supported feeds are:
- - Rss v0.91, 0.92 and 2.0
- - Atom 1.0
+Supported feeds are:
+- Rss v0.91, 0.92 and 2.0
+- Atom 1.0

- The package allows us to maintain cache timeout management. This prevents us
- from querying the servers for feed updates too often and risk ip bans. Apart
- from setting a cache timeout manually, the package also optionally adheres to
- the TTL, SkipDays and SkipHours values specied in the feeds themselves.
+The package supports cache timeout management. This prevents us
+from querying the servers for feed updates too often and risking IP bans. Apart
+from setting a cache timeout manually, the package also optionally adheres to
+the TTL, SkipDays and SkipHours values specified in the feeds themselves.

- Note that the TTL, SkipDays and SkipHour fields are only part of the RSS spec.
- For Atom feeds, we use the CacheTimeout in the Feed struct.
+Note that the TTL, SkipDays and SkipHours fields are only part of the RSS spec.
+For Atom feeds, we use the CacheTimeout in the Feed struct.

- Because the object structure is a hybrid between both RSS and Atom specs, not
- all fields will be filled when requesting either an RSS or Atom feed. I have
- tried to create as many shared fields as possible but some of them simply do
- not occur in either the RSS or Atom spec.
+Because the object structure is a hybrid between both RSS and Atom specs, not
+all fields will be filled when requesting either an RSS or Atom feed. I have
+tried to create as many shared fields as possible but some of them simply do
+not occur in either the RSS or Atom spec.
+
+The Feed object supports notifications of new channels and items.
+This is achieved by passing two function handlers to the feeder.New() function.
+They will be called whenever a feed is updated from the remote source and
+either a new channel or a new item is found that previously did not exist.
+This allows you to easily monitor a feed for changes. See src/feed_test.go for
+an example of how this works.
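
A minimal usage sketch of the notification API described above, based on the New(), Fetch(), ChannelHandler and ItemHandler signatures introduced in this patch. The import path, handler bodies and feed URL are illustrative only; see src/feed_test.go for the test that exercises the same flow.

    package main

    import "fmt"
    import "feeder" // import path is a guess; adjust to wherever the package is installed

    func main() {
    	// 5 minute cache timeout, honour the feed's own TTL/SkipDays/SkipHours,
    	// and register the two notification handlers added by this change.
    	feed := feeder.New(5, true, chanHandler, itemHandler)

    	// Example feed URL taken from src/feed_test.go.
    	if err := feed.Fetch("http://cyber.law.harvard.edu/rss/examples/rss2sample.xml"); err != nil {
    		fmt.Println("fetch failed:", err)
    	}
    }

    // Called during Fetch() with any channels that were not seen before.
    func chanHandler(feed *feeder.Feed, newchannels []*feeder.Channel) {
    	fmt.Println(len(newchannels), "new channel(s) in", feed.Url)
    }

    // Called during Fetch() with any items not previously seen in a given channel.
    func itemHandler(feed *feeder.Feed, ch *feeder.Channel, newitems []*feeder.Item) {
    	fmt.Println(len(newitems), "new item(s) in", ch.Title, "of", feed.Url)
    }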
================================================================================
LICENSE
diff --git a/src/atom.go b/src/atom.go
index d7aa967..11336af 100644
--- a/src/atom.go
+++ b/src/atom.go
@@ -6,14 +6,42 @@ import "xmlx"
 func (this *Feed) readAtom(doc *xmlx.Document) (err os.Error) {
 	ns := "http://www.w3.org/2005/Atom"
 	channels := doc.SelectNodes(ns, "feed")
+
+	getChan := func(id string) *Channel {
+		for _, c := range this.Channels {
+			if c.Id == id {
+				return c
+			}
+		}
+		return nil
+	}
+
+	haveItem := func(ch *Channel, id string) bool {
+		for _, item := range ch.Items {
+			if item.Id == id {
+				return true
+			}
+		}
+		return false
+	}
+
+	var ch *Channel
+	var i *Item
+	var tn *xmlx.Node
+	var list []*xmlx.Node
+
 	for _, node := range channels {
-		ch := Channel{}
+		if ch = getChan(node.GetValue("", "pubDate")); ch == nil {
+			ch = new(Channel)
+			this.Channels = append(this.Channels, ch)
+		}
+
 		ch.Title = node.GetValue(ns, "title")
 		ch.LastBuildDate = node.GetValue(ns, "updated")
 		ch.Id = node.GetValue(ns, "id")
 		ch.Rights = node.GetValue(ns, "rights")
-		list := node.SelectNodes(ns, "link")
+		list = node.SelectNodes(ns, "link")
 		ch.Links = make([]Link, len(list))
 		for i, v := range list {
 			ch.Links[i].Href = v.GetAttr("", "href")
@@ -22,8 +50,7 @@ func (this *Feed) readAtom(doc *xmlx.Document) (err os.Error) {
 			ch.Links[i].HrefLang = v.GetAttr("", "hreflang")
 		}

-		tn := node.SelectNode(ns, "subtitle")
-		if tn != nil {
+		if tn = node.SelectNode(ns, "subtitle"); tn != nil {
 			ch.SubTitle = SubTitle{}
 			ch.SubTitle.Type = tn.GetAttr("", "type")
 			ch.SubTitle.Text = tn.Value
@@ -43,50 +70,56 @@ func (this *Feed) readAtom(doc *xmlx.Document) (err os.Error) {
 			ch.Author.Email = tn.GetValue("", "email")
 		}

+		itemcount := len(ch.Items)
 		list = node.SelectNodes(ns, "entry")
-		ch.Items = make([]Item, len(list))
-		for _, v := range list {
-			item := Item{}
-			item.Title = v.GetValue(ns, "title")
-			item.Id = v.GetValue(ns, "id")
-			item.PubDate = v.GetValue(ns, "updated")
-			item.Description = v.GetValue(ns, "summary")
-			list = v.SelectNodes(ns, "link")
-			item.Links = make([]Link, 0)
-			for _, lv := range list {
+		for _, item := range list {
+			if haveItem(ch, item.GetValue("", "id")) {
+				continue
+			}
+
+			i = new(Item)
+			i.Title = item.GetValue(ns, "title")
+			i.Id = item.GetValue(ns, "id")
+			i.PubDate = item.GetValue(ns, "updated")
+			i.Description = item.GetValue(ns, "summary")
+
+			links := item.SelectNodes(ns, "link")
+			for _, lv := range links {
 				if tn.GetAttr(ns, "rel") == "enclosure" {
-					enc := Enclosure{}
+					enc := new(Enclosure)
 					enc.Url = lv.GetAttr("", "href")
 					enc.Type = lv.GetAttr("", "type")
-					item.Enclosures = append(item.Enclosures, enc)
+					i.Enclosures = append(i.Enclosures, enc)
 				} else {
-					lnk := Link{}
+					lnk := new(Link)
 					lnk.Href = lv.GetAttr("", "href")
 					lnk.Rel = lv.GetAttr("", "rel")
 					lnk.Type = lv.GetAttr("", "type")
 					lnk.HrefLang = lv.GetAttr("", "hreflang")
-					item.Links = append(item.Links, lnk)
+					i.Links = append(i.Links, lnk)
 				}
 			}

-			list = v.SelectNodes(ns, "contributor")
-			item.Contributors = make([]string, len(list))
-			for ci, cv := range list {
-				item.Contributors[ci] = cv.GetValue("", "name")
+			list = item.SelectNodes(ns, "contributor")
+			for _, cv := range list {
+				i.Contributors = append(i.Contributors, cv.GetValue("", "name"))
 			}

-			if tn = v.SelectNode(ns, "content"); tn != nil {
-				item.Content = Content{}
-				item.Content.Type = tn.GetAttr("", "type")
-				item.Content.Lang = tn.GetValue("xml", "lang")
-				item.Content.Base = tn.GetValue("xml", "base")
-				item.Content.Text = tn.Value
+			if tn = item.SelectNode(ns, "content"); tn != nil {
+				i.Content = new(Content)
+				i.Content.Type = tn.GetAttr("", "type")
+				i.Content.Lang = tn.GetValue("xml", "lang")
+				i.Content.Base = tn.GetValue("xml", "base")
+				i.Content.Text = tn.Value
 			}
-			ch.Items = append(ch.Items, item)
+
+			ch.Items = append(ch.Items, i)
 		}
-		this.Channels = append(this.Channels, ch)
+		if itemcount != len(ch.Items) && this.itemhandler != nil {
+			this.itemhandler(this, ch, ch.Items[itemcount:])
+		}
 	}
 	return
 }
diff --git a/src/channel.go b/src/channel.go
index 5de5b83..f602675 100644
--- a/src/channel.go
+++ b/src/channel.go
@@ -11,14 +11,14 @@ type Channel struct {
 	PubDate       string
 	LastBuildDate string
 	Docs          string
-	Categories    []Category
+	Categories    []*Category
 	Generator     Generator
 	TTL           int
 	Rating        string
 	SkipHours     []int
 	SkipDays      []int
 	Image         Image
-	Items         []Item
+	Items         []*Item
 	Cloud         Cloud
 	TextInput     Input
diff --git a/src/feed.go b/src/feed.go
index c18990f..2011d8a 100644
--- a/src/feed.go
+++ b/src/feed.go
@@ -32,6 +32,9 @@ import "fmt"
 import "strconv"
 import "strings"

+type ChannelHandler func(f *Feed, newchannels []*Channel)
+type ItemHandler func(f *Feed, ch *Channel, newitems []*Item)
+
 type Feed struct {
 	// Custom cache timeout in minutes.
 	CacheTimeout int
@@ -47,21 +50,31 @@ type Feed struct {
 	Version [2]int

 	// Channels with content.
-	Channels []Channel
+	Channels []*Channel

 	// Url from which this feed was created.
 	Url string

+	// A notification function, used to notify the host when a new channel
+	// has been found.
+	chanhandler ChannelHandler
+
+	// A notification function, used to notify the host when a new item
+	// has been found for a given channel.
+	itemhandler ItemHandler
+
 	// Last time content was fetched. Used in conjunction with CacheTimeout
 	// to ensure we don't get content too often.
 	lastupdate int64
 }

-func New(cachetimeout int, enforcecachelimit bool) *Feed {
+func New(cachetimeout int, enforcecachelimit bool, ch ChannelHandler, ih ItemHandler) *Feed {
 	v := new(Feed)
 	v.CacheTimeout = cachetimeout
 	v.EnforceCacheLimit = enforcecachelimit
 	v.Type = "none"
+	v.chanhandler = ch
+	v.itemhandler = ih
 	return v
 }

@@ -71,7 +84,6 @@ func (this *Feed) Fetch(uri string) (err os.Error) {
 	}

 	this.Url = uri
-	this.Channels = nil

 	// Extract type and version of the feed so we can have the appropriate
 	// function parse it (rss 0.91, rss 0.92, rss 2, atom etc).
@@ -86,14 +98,21 @@ func (this *Feed) Fetch(uri string) (err os.Error) {
 		return
 	}

+	chancount := len(this.Channels)
 	if err = this.buildFeed(doc); err != nil || len(this.Channels) == 0 {
 		return
 	}

+	// Notify host of new channels
+	if chancount != len(this.Channels) && this.chanhandler != nil {
+		this.chanhandler(this, this.Channels[chancount:])
+	}
+
 	// reset cache timeout values according to feed specified values (TTL)
 	if this.EnforceCacheLimit && this.CacheTimeout < this.Channels[0].TTL {
 		this.CacheTimeout = this.Channels[0].TTL
 	}
+
 	return
 }
diff --git a/src/feed_test.go b/src/feed_test.go
index 22ae9b1..ae96c09 100644
--- a/src/feed_test.go
+++ b/src/feed_test.go
@@ -15,10 +15,31 @@ func TestFeed(t *testing.T) {
 	var err os.Error

 	for _, uri := range urilist {
-		feed = New(5, true)
+		feed = New(5, true, chanHandler, itemHandler)
 		if err = feed.Fetch(uri); err != nil {
 			t.Errorf("%s >>> %s", uri, err)
 		}
 	}
+
+	/*
+Output of handlers:
+
+6 new item(s) in WriteTheWeb of http://cyber.law.harvard.edu/rss/examples/sampleRss091.xml
+1 new channel(s) in http://cyber.law.harvard.edu/rss/examples/sampleRss091.xml
+21 new item(s) in Dave Winer: Grateful Dead of http://cyber.law.harvard.edu/rss/examples/sampleRss092.xml
+1 new channel(s) in http://cyber.law.harvard.edu/rss/examples/sampleRss092.xml
+4 new item(s) in Liftoff News of http://cyber.law.harvard.edu/rss/examples/rss2sample.xml
+1 new channel(s) in http://cyber.law.harvard.edu/rss/examples/rss2sample.xml
+15 new item(s) in Blog@Case of http://blog.case.edu/news/feed.atom
+1 new channel(s) in http://blog.case.edu/news/feed.atom
+	*/
+}
+
+func chanHandler(feed *Feed, newchannels []*Channel) {
+	//println(len(newchannels), "new channel(s) in", feed.Url)
+}
+
+func itemHandler(feed *Feed, ch *Channel, newitems []*Item) {
+	//println(len(newitems), "new item(s) in", ch.Title, "of", feed.Url)
 }
diff --git a/src/item.go b/src/item.go
index 046c18f..acfeb34 100644
--- a/src/item.go
+++ b/src/item.go
@@ -3,19 +3,19 @@ package feeder
 type Item struct {
 	// RSS and Shared fields
 	Title       string
-	Links       []Link
+	Links       []*Link
 	Description string
 	Author      Author
-	Categories  []Category
+	Categories  []*Category
 	Comments    string
-	Enclosures  []Enclosure
+	Enclosures  []*Enclosure
 	Guid        string
 	PubDate     string
-	Source      Source
+	Source      *Source

 	// Atom specific fields
 	Id           string
-	Generator    Generator
+	Generator    *Generator
 	Contributors []string
-	Content      Content
+	Content      *Content
 }
diff --git a/src/rss.go b/src/rss.go
index e4901a0..20af1f9 100644
--- a/src/rss.go
+++ b/src/rss.go
@@ -4,13 +4,60 @@ import "os"
 import "xmlx"

 func (this *Feed) readRss2(doc *xmlx.Document) (err os.Error) {
+	days := make(map[string]int)
+	days["Monday"] = 1
+	days["Tuesday"] = 2
+	days["Wednesday"] = 3
+	days["Thursday"] = 4
+	days["Friday"] = 5
+	days["Saturday"] = 6
+	days["Sunday"] = 7
+
+	getChan := func(pubdate string) *Channel {
+		for _, c := range this.Channels {
+			if c.PubDate == pubdate {
+				return c
+			}
+		}
+		return nil
+	}
+
+	haveItem := func(ch *Channel, id, title, desc string) bool {
+		for _, item := range ch.Items {
+			switch {
+			case len(id) > 0:
+				if item.Id == id {
+					return true
+				}
+			case len(title) > 0:
+				if item.Title == title {
+					return true
+				}
+			default:
+				if item.Description == desc {
+					return true
+				}
+			}
+		}
+		return false
+	}
+
+	var ch *Channel
+	var i *Item
+	var n *xmlx.Node
+	var list, tl []*xmlx.Node
+
 	channels := doc.SelectNodes("", "channel")
 	for _, node := range channels {
-		ch := Channel{}
-		ch.Title = node.GetValue("", "title")
+		if ch = getChan(node.GetValue("", "pubDate")); ch == nil {
+			ch = new(Channel)
+			this.Channels = append(this.Channels, ch)
+		}

-		list := node.SelectNodes("", "link")
+		ch.Title = node.GetValue("", "title")
+		list = node.SelectNodes("", "link")
 		ch.Links = make([]Link, len(list))
+
 		for i, v := range list {
 			ch.Links[i].Href = v.Value
 		}
@@ -25,14 +72,14 @@ func (this *Feed) readRss2(doc *xmlx.Document) (err os.Error) {
 		ch.Docs = node.GetValue("", "docs")

 		list = node.SelectNodes("", "category")
-		ch.Categories = make([]Category, len(list))
+		ch.Categories = make([]*Category, len(list))
 		for i, v := range list {
+			ch.Categories[i] = new(Category)
 			ch.Categories[i].Domain = v.GetAttr("", "domain")
 			ch.Categories[i].Text = v.Value
 		}

-		n := node.SelectNode("", "generator")
-		if n != nil {
+		if n = node.SelectNode("", "generator"); n != nil {
 			ch.Generator = Generator{}
 			ch.Generator.Text = n.Value
 		}
@@ -49,7 +96,7 @@ func (this *Feed) readRss2(doc *xmlx.Document) (err os.Error) {
 		list = node.SelectNodes("", "days")
 		ch.SkipDays = make([]int, len(list))
 		for i, v := range list {
-			ch.SkipDays[i] = mapDay(v.Value)
+			ch.SkipDays[i] = days[v.Value]
 		}

 		if n = node.SelectNode("", "image"); n != nil {
@@ -78,16 +125,22 @@ func (this *Feed) readRss2(doc *xmlx.Document) (err os.Error) {
 			ch.TextInput.Link = n.GetValue("", "link")
 		}

+		itemcount := len(ch.Items)
 		list = node.SelectNodes("", "item")
+
 		for _, item := range list {
-			i := Item{}
+			if haveItem(ch, item.GetValue("", "pubDate"),
+				item.GetValue("", "title"), item.GetValue("", "description")) {
+				continue
+			}
+
+			i = new(Item)
 			i.Title = item.GetValue("", "title")
 			i.Description = item.GetValue("", "description")

-			list = node.SelectNodes("", "link")
-			i.Links = make([]Link, 0)
-			for _, v := range list {
-				lnk := Link{}
+			tl = node.SelectNodes("", "link")
+			for _, v := range tl {
+				lnk := new(Link)
 				lnk.Href = v.Value
 				i.Links = append(i.Links, lnk)
 			}
@@ -101,24 +154,25 @@ func (this *Feed) readRss2(doc *xmlx.Document) (err os.Error) {
 			i.Guid = item.GetValue("", "guid")
 			i.PubDate = item.GetValue("", "pubDate")

-			list := item.SelectNodes("", "category")
-			i.Categories = make([]Category, len(list))
-			for li, lv := range list {
-				i.Categories[li].Domain = lv.GetAttr("", "domain")
-				i.Categories[li].Text = lv.Value
+			tl = item.SelectNodes("", "category")
+			for _, lv := range tl {
+				cat := new(Category)
+				cat.Domain = lv.GetAttr("", "domain")
+				cat.Text = lv.Value
+				i.Categories = append(i.Categories, cat)
 			}

-			list = item.SelectNodes("", "enclosure")
-			i.Enclosures = make([]Enclosure, len(list))
-			for li, lv := range list {
-				i.Enclosures[li].Url = lv.GetAttr("", "url")
-				i.Enclosures[li].Length = lv.GetAttri64("", "length")
-				i.Enclosures[li].Type = lv.GetAttr("", "type")
+			tl = item.SelectNodes("", "enclosure")
+			for _, lv := range tl {
+				enc := new(Enclosure)
+				enc.Url = lv.GetAttr("", "url")
+				enc.Length = lv.GetAttri64("", "length")
+				enc.Type = lv.GetAttr("", "type")
+				i.Enclosures = append(i.Enclosures, enc)
 			}

-			src := item.SelectNode("", "source")
-			if src != nil {
-				i.Source = Source{}
+			if src := item.SelectNode("", "source"); src != nil {
+				i.Source = new(Source)
 				i.Source.Url = src.GetAttr("", "url")
 				i.Source.Text = src.Value
 			}
@@ -126,27 +180,9 @@ func (this *Feed) readRss2(doc *xmlx.Document) (err os.Error) {
 			ch.Items = append(ch.Items, i)
 		}

-		this.Channels = append(this.Channels, ch)
+		if itemcount != len(ch.Items) && this.itemhandler != nil {
+			this.itemhandler(this, ch, ch.Items[itemcount:])
+		}
 	}
 	return
 }
-
-func mapDay(day string) int {
-	switch day {
-	case "Monday":
-		return 1
-	case "Tuesday":
-		return 2
-	case "Wednesday":
-		return 3
-	case "Thursday":
-		return 4
-	case "Friday":
-		return 5
-	case "Saturday":
-		return 6
-	case "Sunday":
-		return 7
-	}
-	return 1
-}