From 89307010adab9c1b0c8b963f50898ee449bbe42f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Mon, 20 Nov 2017 18:34:11 -0800 Subject: [PATCH] Add parser for RDF feeds --- locale/translations.go | 13 +- locale/translations/fr_FR.json | 9 +- reader/feed/atom/parser.go | 4 +- reader/feed/parser.go | 31 ++-- reader/feed/parser_test.go | 48 +++++- reader/feed/rdf/parser.go | 28 +++ reader/feed/rdf/parser_test.go | 307 +++++++++++++++++++++++++++++++++ reader/feed/rdf/rdf.go | 71 ++++++++ reader/feed/rss/parser.go | 2 +- reader/opml/parser.go | 2 +- server/static/bin.go | 2 +- server/static/css.go | 2 +- server/static/js.go | 2 +- server/template/common.go | 2 +- server/template/views.go | 2 +- sql/sql.go | 2 +- 16 files changed, 491 insertions(+), 36 deletions(-) create mode 100644 reader/feed/rdf/parser.go create mode 100644 reader/feed/rdf/parser_test.go create mode 100644 reader/feed/rdf/rdf.go diff --git a/locale/translations.go b/locale/translations.go index 298d0e02..ad993bc7 100644 --- a/locale/translations.go +++ b/locale/translations.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. 
-// 2017-11-20 17:09:36.257679981 -0800 PST m=+0.024050336 +// 2017-11-20 18:31:16.993089344 -0800 PST m=+0.032236726 package locale @@ -126,10 +126,11 @@ var Translations = map[string]string{ "Unable to execute request: %v": "Impossible d'exécuter cette requête: %v", "Last Parsing Error": "Dernière erreur d'analyse", "There is a problem with this feed": "Il y a un problème avec cet abonnement", - "Unable to parse OPML file: %v": "Impossible de lire le fichier OPML : %v", - "Unable to parse RSS feed: %v": "Impossible de lire ce flux RSS: %v", - "Unable to parse Atom feed: %v": "Impossible de lire ce flux Atom: %v", - "Unable to parse JSON feed: %v": "Impossible de lire ce flux Json: %v", + "Unable to parse OPML file: %v.": "Impossible de lire ce fichier OPML : %v.", + "Unable to parse RSS feed: %v.": "Impossible de lire ce flux RSS: %v.", + "Unable to parse Atom feed: %v.": "Impossible de lire ce flux Atom: %v.", + "Unable to parse JSON feed: %v.": "Impossible de lire ce flux JSON: %v.", + "Unable to parse RDF feed: %v.": "Impossible de lire ce flux RDF: %v.", "Unable to normalize encoding: %v.": "Impossible de normaliser l'encodage : %v." 
} `, @@ -137,5 +138,5 @@ var Translations = map[string]string{ var TranslationsChecksums = map[string]string{ "en_US": "6fe95384260941e8a5a3c695a655a932e0a8a6a572c1e45cb2b1ae8baa01b897", - "fr_FR": "0ff93081d867ab27a190b5cbe6aaed65dbdcd80079ad667b515428a147cb20ee", + "fr_FR": "946d1c30bcb862ef35741786cdb5768900ad0d704e802472e481540f9b6542e5", } diff --git a/locale/translations/fr_FR.json b/locale/translations/fr_FR.json index 40aa51af..b10f0fc0 100644 --- a/locale/translations/fr_FR.json +++ b/locale/translations/fr_FR.json @@ -110,9 +110,10 @@ "Unable to execute request: %v": "Impossible d'exécuter cette requête: %v", "Last Parsing Error": "Dernière erreur d'analyse", "There is a problem with this feed": "Il y a un problème avec cet abonnement", - "Unable to parse OPML file: %v": "Impossible de lire le fichier OPML : %v", - "Unable to parse RSS feed: %v": "Impossible de lire ce flux RSS: %v", - "Unable to parse Atom feed: %v": "Impossible de lire ce flux Atom: %v", - "Unable to parse JSON feed: %v": "Impossible de lire ce flux Json: %v", + "Unable to parse OPML file: %v.": "Impossible de lire ce fichier OPML : %v.", + "Unable to parse RSS feed: %v.": "Impossible de lire ce flux RSS: %v.", + "Unable to parse Atom feed: %v.": "Impossible de lire ce flux Atom: %v.", + "Unable to parse JSON feed: %v.": "Impossible de lire ce flux JSON: %v.", + "Unable to parse RDF feed: %v.": "Impossible de lire ce flux RDF: %v.", "Unable to normalize encoding: %v.": "Impossible de normaliser l'encodage : %v." } diff --git a/reader/feed/atom/parser.go b/reader/feed/atom/parser.go index cb21c041..ec0d6b40 100644 --- a/reader/feed/atom/parser.go +++ b/reader/feed/atom/parser.go @@ -14,7 +14,7 @@ import ( "golang.org/x/net/html/charset" ) -// Parse returns a normalized feed struct. +// Parse returns a normalized feed struct from an Atom feed. 
func Parse(data io.Reader) (*model.Feed, error) { atomFeed := new(AtomFeed) decoder := xml.NewDecoder(data) @@ -22,7 +22,7 @@ func Parse(data io.Reader) (*model.Feed, error) { err := decoder.Decode(atomFeed) if err != nil { - return nil, errors.NewLocalizedError("Unable to parse Atom feed: %v", err) + return nil, errors.NewLocalizedError("Unable to parse Atom feed: %v.", err) } return atomFeed.Transform(), nil diff --git a/reader/feed/parser.go b/reader/feed/parser.go index 8df6b46d..d94d72b8 100644 --- a/reader/feed/parser.go +++ b/reader/feed/parser.go @@ -8,25 +8,30 @@ import ( "bytes" "encoding/xml" "errors" - "github.com/miniflux/miniflux2/helper" - "github.com/miniflux/miniflux2/model" - "github.com/miniflux/miniflux2/reader/feed/atom" - "github.com/miniflux/miniflux2/reader/feed/json" - "github.com/miniflux/miniflux2/reader/feed/rss" "io" "strings" "time" + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/model" + "github.com/miniflux/miniflux2/reader/feed/atom" + "github.com/miniflux/miniflux2/reader/feed/json" + "github.com/miniflux/miniflux2/reader/feed/rdf" + "github.com/miniflux/miniflux2/reader/feed/rss" + "golang.org/x/net/html/charset" ) +// List of feed formats. const ( - FormatRss = "rss" + FormatRDF = "rdf" + FormatRSS = "rss" FormatAtom = "atom" - FormatJson = "json" + FormatJSON = "json" FormatUnknown = "unknown" ) +// DetectFeedFormat detects the feed format from input data. 
func DetectFeedFormat(data io.Reader) string { defer helper.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]") @@ -45,15 +50,17 @@ func DetectFeedFormat(data io.Reader) string { if element, ok := token.(xml.StartElement); ok { switch element.Name.Local { case "rss": - return FormatRss + return FormatRSS case "feed": return FormatAtom + case "RDF": + return FormatRDF } } } if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") { - return FormatJson + return FormatJSON } return FormatUnknown @@ -72,10 +79,12 @@ func parseFeed(data io.Reader) (*model.Feed, error) { switch format { case FormatAtom: return atom.Parse(reader) - case FormatRss: + case FormatRSS: return rss.Parse(reader) - case FormatJson: + case FormatJSON: return json.Parse(reader) + case FormatRDF: + return rdf.Parse(reader) default: return nil, errors.New("Unsupported feed format") } diff --git a/reader/feed/parser_test.go b/reader/feed/parser_test.go index 0dd8dd68..b201cafc 100644 --- a/reader/feed/parser_test.go +++ b/reader/feed/parser_test.go @@ -9,12 +9,21 @@ import ( "testing" ) +func TestDetectRDF(t *testing.T) { + data := `` + format := DetectFeedFormat(bytes.NewBufferString(data)) + + if format != FormatRDF { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatRDF) + } +} + func TestDetectRSS(t *testing.T) { data := `` format := DetectFeedFormat(bytes.NewBufferString(data)) - if format != FormatRss { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatRss) + if format != FormatRSS { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatRSS) } } @@ -45,8 +54,8 @@ func TestDetectJSON(t *testing.T) { ` format := DetectFeedFormat(bytes.NewBufferString(data)) - if format != FormatJson { - t.Errorf("Wrong format detected: %s instead of %s", format, FormatJson) + if format != FormatJSON { + t.Errorf("Wrong format detected: %s instead of %s", format, FormatJSON) } } @@ -93,7 +102,7 @@ func TestParseAtom(t *testing.T) { } } -func TestParseRss(t 
*testing.T) { +func TestParseRSS(t *testing.T) { data := ` @@ -119,6 +128,35 @@ func TestParseRss(t *testing.T) { } } +func TestParseRDF(t *testing.T) { + data := ` + + + + RDF Example + http://example.org/ + + + + Title + http://example.org/item + Test + + ` + + feed, err := parseFeed(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "RDF Example" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } +} + func TestParseJson(t *testing.T) { data := `{ "version": "https://jsonfeed.org/version/1", diff --git a/reader/feed/rdf/parser.go b/reader/feed/rdf/parser.go new file mode 100644 index 00000000..f854a97b --- /dev/null +++ b/reader/feed/rdf/parser.go @@ -0,0 +1,28 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rdf + +import ( + "encoding/xml" + "io" + + "github.com/miniflux/miniflux2/errors" + "github.com/miniflux/miniflux2/model" + "golang.org/x/net/html/charset" +) + +// Parse returns a normalized feed struct from an RDF feed. +func Parse(data io.Reader) (*model.Feed, error) { + feed := new(rdfFeed) + decoder := xml.NewDecoder(data) + decoder.CharsetReader = charset.NewReaderLabel + + err := decoder.Decode(feed) + if err != nil { + return nil, errors.NewLocalizedError("Unable to parse RDF feed: %v.", err) + } + + return feed.Transform(), nil +} diff --git a/reader/feed/rdf/parser_test.go b/reader/feed/rdf/parser_test.go new file mode 100644 index 00000000..dadca6f0 --- /dev/null +++ b/reader/feed/rdf/parser_test.go @@ -0,0 +1,307 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. 
+ +package rdf + +import ( + "bytes" + "strings" + "testing" + + "github.com/miniflux/miniflux2/errors" +) + +func TestParseRDFSample(t *testing.T) { + data := ` + + + + + + XML.com + http://xml.com/pub + + XML.com features a rich mix of information and services + for the XML community. + + + + + + + + + + + + + + + + + XML.com + http://www.xml.com + http://xml.com/universal/images/xml_tiny.gif + + + + Processing Inclusions with XSLT + http://xml.com/pub/2000/08/09/xslt/xslt.html + + Processing document inclusions with general XML tools can be + problematic. This article proposes a way of preserving inclusion + information through SAX-based processing. + + + + + Putting RDF to Work + http://xml.com/pub/2000/08/09/rdfdb/index.html + + Tool and API support for the Resource Description Framework + is slowly coming of age. Edd Dumbill takes a look at RDFDB, + one of the most exciting new RDF toolkits. + + + + + Search XML.com + Search XML.com's XML collection + s + http://search.xml.com + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "XML.com" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://xml.com/pub" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 2 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[1].Hash != "8aaeee5d3ab50351422fbded41078ee88c73bf1441085b16a8c09fd90a7db321" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[1].URL != "http://xml.com/pub/2000/08/09/rdfdb/index.html" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[1].Title != "Putting RDF to Work" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } + + if strings.HasSuffix(feed.Entries[1].Content, "Tool and API 
support") { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } +} + +func TestParseRDFSampleWithDublinCore(t *testing.T) { + data := ` + + + + + Meerkat + http://meerkat.oreillynet.com + Meerkat: An Open Wire Service + The O'Reilly Network + Rael Dornfest (mailto:rael@oreilly.com) + Copyright © 2000 O'Reilly & Associates, Inc. + 2000-01-01T12:00+00:00 + hourly + 2 + 2000-01-01T12:00+00:00 + + + + + + + + + + + + + + + Meerkat Powered! + http://meerkat.oreillynet.com/icons/meerkat-powered.jpg + http://meerkat.oreillynet.com + + + + XML: A Disruptive Technology + http://c.moreover.com/click/here.pl?r123 + + XML is placing increasingly heavy loads on the existing technical + infrastructure of the Internet. + + The O'Reilly Network + Simon St.Laurent (mailto:simonstl@simonstl.com) + Copyright © 2000 O'Reilly & Associates, Inc. + XML + XML.com + NASDAQ + XML + + + + Search Meerkat + Search Meerkat's RSS Database... + s + http://meerkat.oreillynet.com/ + search + regex + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Title != "Meerkat" { + t.Errorf("Incorrect title, got: %s", feed.Title) + } + + if feed.FeedURL != "" { + t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL) + } + + if feed.SiteURL != "http://meerkat.oreillynet.com" { + t.Errorf("Incorrect site URL, got: %s", feed.SiteURL) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Hash != "fa4ef7c300b175ca66f92f226b5dba5caa2a9619f031101bf56e5b884b02cd97" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://c.moreover.com/click/here.pl?r123" { + t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL) + } + + if feed.Entries[0].Title != "XML: A Disruptive Technology" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } + + if strings.HasSuffix(feed.Entries[0].Content, 
"XML is placing increasingly") { + t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content) + } + + if feed.Entries[0].Author != "Simon St.Laurent (mailto:simonstl@simonstl.com)" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseItemWithOnlyFeedAuthor(t *testing.T) { + data := ` + + + + + Meerkat + http://meerkat.oreillynet.com + Rael Dornfest (mailto:rael@oreilly.com) + + + + XML: A Disruptive Technology + http://c.moreover.com/click/here.pl?r123 + + XML is placing increasingly heavy loads on the existing technical + infrastructure of the Internet. + + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Author != "Rael Dornfest (mailto:rael@oreilly.com)" { + t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author) + } +} + +func TestParseItemWithoutLink(t *testing.T) { + data := ` + + + + + Meerkat + http://meerkat.oreillynet.com + + + + Title + Test + + ` + + feed, err := Parse(bytes.NewBufferString(data)) + if err != nil { + t.Error(err) + } + + if feed.Entries[0].Hash != "37f5223ebd58639aa62a49afbb61df960efb7dc5db5181dfb3cedd9a49ad34c6" { + t.Errorf("Incorrect entry hash, got: %s", feed.Entries[0].Hash) + } + + if feed.Entries[0].URL != "http://meerkat.oreillynet.com" { + t.Errorf("Incorrect entry url, got: %s", feed.Entries[0].URL) + } +} + +func TestParseInvalidXml(t *testing.T) { + data := `garbage` + _, err := Parse(bytes.NewBufferString(data)) + if err == nil { + t.Error("Parse should returns an error") + } + + if _, ok := err.(errors.LocalizedError); !ok { + t.Error("The error returned must be a LocalizedError") + } +} diff --git a/reader/feed/rdf/rdf.go b/reader/feed/rdf/rdf.go new file mode 100644 index 00000000..05281ca0 --- /dev/null +++ b/reader/feed/rdf/rdf.go @@ -0,0 +1,71 @@ +// Copyright 2017 Frédéric Guillot. All rights reserved. 
+// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package rdf + +import ( + "encoding/xml" + + "github.com/miniflux/miniflux2/helper" + "github.com/miniflux/miniflux2/reader/processor" + "github.com/miniflux/miniflux2/reader/sanitizer" + + "github.com/miniflux/miniflux2/model" +) + +type rdfFeed struct { + XMLName xml.Name `xml:"RDF"` + Title string `xml:"channel>title"` + Link string `xml:"channel>link"` + Creator string `xml:"channel>creator"` + Items []rdfItem `xml:"item"` +} + +func (r *rdfFeed) Transform() *model.Feed { + feed := new(model.Feed) + feed.Title = sanitizer.StripTags(r.Title) + feed.SiteURL = r.Link + + for _, item := range r.Items { + entry := item.Transform() + + if entry.Author == "" && r.Creator != "" { + entry.Author = sanitizer.StripTags(r.Creator) + } + + if entry.URL == "" { + entry.URL = feed.SiteURL + } + + feed.Entries = append(feed.Entries, entry) + } + + return feed +} + +type rdfItem struct { + Title string `xml:"title"` + Link string `xml:"link"` + Description string `xml:"description"` + Creator string `xml:"creator"` +} + +func (r *rdfItem) Transform() *model.Entry { + entry := new(model.Entry) + entry.Title = sanitizer.StripTags(r.Title) + entry.Author = sanitizer.StripTags(r.Creator) + entry.URL = r.Link + entry.Content = processor.ItemContentProcessor(entry.URL, r.Description) + entry.Hash = getHash(r) + return entry +} + +func getHash(r *rdfItem) string { + value := r.Link + if value == "" { + value = r.Title + r.Description + } + + return helper.Hash(value) +} diff --git a/reader/feed/rss/parser.go b/reader/feed/rss/parser.go index 4eceb455..a5b4434c 100644 --- a/reader/feed/rss/parser.go +++ b/reader/feed/rss/parser.go @@ -22,7 +22,7 @@ func Parse(data io.Reader) (*model.Feed, error) { err := decoder.Decode(feed) if err != nil { - return nil, errors.NewLocalizedError("Unable to parse RSS feed: %v", err) + return nil, errors.NewLocalizedError("Unable to parse RSS 
feed: %v.", err) } return feed.Transform(), nil diff --git a/reader/opml/parser.go b/reader/opml/parser.go index 22706413..02a6dbae 100644 --- a/reader/opml/parser.go +++ b/reader/opml/parser.go @@ -20,7 +20,7 @@ func Parse(data io.Reader) (SubcriptionList, error) { err := decoder.Decode(opml) if err != nil { - return nil, errors.NewLocalizedError("Unable to parse OPML file: %v", err) + return nil, errors.NewLocalizedError("Unable to parse OPML file: %v.", err) } return opml.Transform(), nil diff --git a/server/static/bin.go b/server/static/bin.go index b95023b7..0fef1c47 100644 --- a/server/static/bin.go +++ b/server/static/bin.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. -// 2017-11-20 17:09:36.239163817 -0800 PST m=+0.005534172 +// 2017-11-20 18:31:16.964945842 -0800 PST m=+0.004093224 package static diff --git a/server/static/css.go b/server/static/css.go index bfe6b6a6..d9d51266 100644 --- a/server/static/css.go +++ b/server/static/css.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. -// 2017-11-20 17:09:36.24112331 -0800 PST m=+0.007493665 +// 2017-11-20 18:31:16.967667594 -0800 PST m=+0.006814976 package static diff --git a/server/static/js.go b/server/static/js.go index 816728e9..df65da98 100644 --- a/server/static/js.go +++ b/server/static/js.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. -// 2017-11-20 17:09:36.242888415 -0800 PST m=+0.009258770 +// 2017-11-20 18:31:16.972315949 -0800 PST m=+0.011463331 package static diff --git a/server/template/common.go b/server/template/common.go index 16db6eae..d2258db9 100644 --- a/server/template/common.go +++ b/server/template/common.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. 
-// 2017-11-20 17:09:36.256513528 -0800 PST m=+0.022883883 +// 2017-11-20 18:31:16.991583598 -0800 PST m=+0.030730980 package template diff --git a/server/template/views.go b/server/template/views.go index 8b224b61..f2246460 100644 --- a/server/template/views.go +++ b/server/template/views.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. -// 2017-11-20 17:09:36.24386504 -0800 PST m=+0.010235395 +// 2017-11-20 18:31:16.974386894 -0800 PST m=+0.013534276 package template diff --git a/sql/sql.go b/sql/sql.go index 5f327c87..a773e57a 100644 --- a/sql/sql.go +++ b/sql/sql.go @@ -1,5 +1,5 @@ // Code generated by go generate; DO NOT EDIT. -// 2017-11-20 17:09:36.23789781 -0800 PST m=+0.004268165 +// 2017-11-20 18:31:16.963285699 -0800 PST m=+0.002433081 package sql