From 9a637ce95e05459adc4712027e6a07eaabcfe657 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Mon, 11 Mar 2024 20:43:14 -0700 Subject: [PATCH] Refactor RSS parser to use default namespace This change avoids some limitations of the Go XML parser regarding XML namespaces --- internal/reader/media/media.go | 1 + internal/reader/rss/atom.go | 43 ++++++ internal/reader/rss/parser.go | 4 +- internal/reader/rss/parser_test.go | 69 +++++----- internal/reader/rss/podcast.go | 42 ++++-- internal/reader/rss/rss.go | 207 +++++++++++------------------ 6 files changed, 185 insertions(+), 181 deletions(-) create mode 100644 internal/reader/rss/atom.go diff --git a/internal/reader/media/media.go b/internal/reader/media/media.go index 4d9c3661..df84bf03 100644 --- a/internal/reader/media/media.go +++ b/internal/reader/media/media.go @@ -12,6 +12,7 @@ import ( var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`) // Element represents XML media elements. +// Specs: https://www.rssboard.org/media-rss type Element struct { MediaGroups []Group `xml:"http://search.yahoo.com/mrss/ group"` MediaContents []Content `xml:"http://search.yahoo.com/mrss/ content"` diff --git a/internal/reader/rss/atom.go b/internal/reader/rss/atom.go new file mode 100644 index 00000000..e0d66910 --- /dev/null +++ b/internal/reader/rss/atom.go @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package rss // import "miniflux.app/v2/internal/reader/rss" + +import "strings" + +type AtomAuthor struct { + Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"` +} + +func (a *AtomAuthor) String() string { + return a.Author.String() +} + +type AtomPerson struct { + Name string `xml:"name"` + Email string `xml:"email"` +} + +func (a *AtomPerson) String() string { + var name string + + switch { + case a.Name != "": + name = a.Name + case a.Email != "": + name = a.Email + } + + return strings.TrimSpace(name) +} + +type AtomLink struct { + URL string `xml:"href,attr"` + Type string `xml:"type,attr"` + Rel string `xml:"rel,attr"` + Length string `xml:"length,attr"` +} + +type AtomLinks struct { + Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"` +} diff --git a/internal/reader/rss/parser.go b/internal/reader/rss/parser.go index a8390dc6..55122ea4 100644 --- a/internal/reader/rss/parser.go +++ b/internal/reader/rss/parser.go @@ -14,7 +14,9 @@ import ( // Parse returns a normalized feed struct from a RSS feed. 
func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) { feed := new(rssFeed) - if err := xml.NewXMLDecoder(data).Decode(feed); err != nil { + decoder := xml.NewXMLDecoder(data) + decoder.DefaultSpace = "rss" + if err := decoder.Decode(feed); err != nil { return nil, fmt.Errorf("rss: unable to parse feed: %w", err) } return feed.Transform(baseURL), nil diff --git a/internal/reader/rss/parser_test.go b/internal/reader/rss/parser_test.go index b3a46719..a8fbc76f 100644 --- a/internal/reader/rss/parser_test.go +++ b/internal/reader/rss/parser_test.go @@ -300,7 +300,7 @@ func TestParseEntryWithMultipleAtomLinks(t *testing.T) { Test - + ` @@ -430,7 +430,7 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) { Test https://example.org/item - by + @@ -447,38 +447,6 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) { } } -func TestParseEntryWithNonStandardAtomAuthor(t *testing.T) { - data := ` - - - Example - https://example.org/ - - - Test - https://example.org/item - - Foo Bar - Vice President - - FooBar Inc. 
- - - - ` - - feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) - if err != nil { - t.Fatal(err) - } - - expected := "Foo Bar" - result := feed.Entries[0].Author - if result != expected { - t.Errorf("Incorrect entry author, got %q instead of %q", result, expected) - } -} - func TestParseEntryWithAtomAuthorEmail(t *testing.T) { data := ` @@ -508,7 +476,7 @@ func TestParseEntryWithAtomAuthorEmail(t *testing.T) { } } -func TestParseEntryWithAtomAuthor(t *testing.T) { +func TestParseEntryWithAtomAuthorName(t *testing.T) { data := ` @@ -1435,6 +1403,37 @@ func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) { } } +func TestParseEntryWithRSSDescriptionAndMediaDescription(t *testing.T) { + data := ` + + + Podcast Example + http://www.example.com/index.html + + Entry Title + http://www.example.com/entries/1 + Entry Description + Media Description + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + expected := "Entry Description" + result := feed.Entries[0].Content + if expected != result { + t.Errorf(`Unexpected description, got %q instead of %q`, result, expected) + } +} + func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) { data := ` diff --git a/internal/reader/rss/podcast.go b/internal/reader/rss/podcast.go index b72426cc..867bc03b 100644 --- a/internal/reader/rss/podcast.go +++ b/internal/reader/rss/podcast.go @@ -15,21 +15,24 @@ var ErrInvalidDurationFormat = errors.New("rss: invalid duration format") // PodcastFeedElement represents iTunes and GooglePlay feed XML elements. 
// Specs: // - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS -// - https://developers.google.com/search/reference/podcast/rss-feed +// - https://support.google.com/podcast-publishers/answer/9889544 type PodcastFeedElement struct { - ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"` - Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>subtitle"` - Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>summary"` - PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>owner"` - GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 channel>author"` + ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"` + Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"` + Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"` + PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"` + GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"` } // PodcastEntryElement represents iTunes and GooglePlay entry XML elements. 
type PodcastEntryElement struct { - Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"` - Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"` - Duration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"` - GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"` + ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"` + Subtitle string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"` + Summary string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"` + Duration string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"` + PodcastOwner PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"` + GooglePlayAuthor string `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"` + GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"` } // PodcastOwner represents contact information for the podcast owner. @@ -38,6 +41,19 @@ type PodcastOwner struct { Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"` } +func (p *PodcastOwner) String() string { + var name string + + switch { + case p.Name != "": + name = p.Name + case p.Email != "": + name = p.Email + } + + return strings.TrimSpace(name) +} + // Image represents podcast artwork. 
type Image struct { URL string `xml:"href,attr"` @@ -52,10 +68,8 @@ func (e *PodcastFeedElement) PodcastAuthor() string { author = e.ItunesAuthor case e.GooglePlayAuthor != "": author = e.GooglePlayAuthor - case e.PodcastOwner.Name != "": - author = e.PodcastOwner.Name - case e.PodcastOwner.Email != "": - author = e.PodcastOwner.Email + case e.PodcastOwner.String() != "": + author = e.PodcastOwner.String() } return strings.TrimSpace(author) diff --git a/internal/reader/rss/rss.go b/internal/reader/rss/rss.go index 963b2d10..cb769141 100644 --- a/internal/reader/rss/rss.go +++ b/internal/reader/rss/rss.go @@ -21,20 +21,25 @@ import ( "miniflux.app/v2/internal/urllib" ) -// Specs: https://cyber.harvard.edu/rss/rss.html +// Specs: https://www.rssboard.org/rss-specification type rssFeed struct { - XMLName xml.Name `xml:"rss"` - Version string `xml:"version,attr"` - Title string `xml:"channel>title"` - Links []rssLink `xml:"channel>link"` - ImageURL string `xml:"channel>image>url"` - Language string `xml:"channel>language"` - Description string `xml:"channel>description"` - PubDate string `xml:"channel>pubDate"` - ManagingEditor string `xml:"channel>managingEditor"` - Webmaster string `xml:"channel>webMaster"` - TimeToLive rssTTL `xml:"channel>ttl"` - Items []rssItem `xml:"channel>item"` + XMLName xml.Name `xml:"rss"` + Version string `xml:"rss version,attr"` + Channel rssChannel `xml:"rss channel"` +} + +type rssChannel struct { + Title string `xml:"rss title"` + Link string `xml:"rss link"` + ImageURL string `xml:"rss image>url"` + Language string `xml:"rss language"` + Description string `xml:"rss description"` + PubDate string `xml:"rss pubDate"` + ManagingEditor string `xml:"rss managingEditor"` + Webmaster string `xml:"rss webMaster"` + TimeToLive rssTTL `xml:"rss ttl"` + Items []rssItem `xml:"rss item"` + AtomLinks PodcastFeedElement } @@ -72,15 +77,15 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed { feed.FeedURL = feedURL } - feed.Title = 
html.UnescapeString(strings.TrimSpace(r.Title)) + feed.Title = html.UnescapeString(strings.TrimSpace(r.Channel.Title)) if feed.Title == "" { feed.Title = feed.SiteURL } - feed.IconURL = strings.TrimSpace(r.ImageURL) - feed.TTL = r.TimeToLive.Value() + feed.IconURL = strings.TrimSpace(r.Channel.ImageURL) + feed.TTL = r.Channel.TimeToLive.Value() - for _, item := range r.Items { + for _, item := range r.Channel.Items { entry := item.Transform() if entry.Author == "" { entry.Author = r.feedAuthor() @@ -110,32 +115,29 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed { } func (r *rssFeed) siteURL() string { - for _, element := range r.Links { - if element.XMLName.Space == "" { - return strings.TrimSpace(element.Data) - } - } - - return "" + return strings.TrimSpace(r.Channel.Link) } func (r *rssFeed) feedURL() string { - for _, element := range r.Links { - if element.XMLName.Space == "http://www.w3.org/2005/Atom" { - return strings.TrimSpace(element.Href) + for _, atomLink := range r.Channel.AtomLinks.Links { + if atomLink.Rel == "self" { + return strings.TrimSpace(atomLink.URL) } } - return "" } func (r rssFeed) feedAuthor() string { - author := r.PodcastAuthor() + author := r.Channel.PodcastAuthor() switch { - case r.ManagingEditor != "": - author = r.ManagingEditor - case r.Webmaster != "": - author = r.Webmaster + case r.Channel.ManagingEditor != "": + author = r.Channel.ManagingEditor + case r.Channel.Webmaster != "": + author = r.Channel.Webmaster + case r.Channel.GooglePlayAuthor != "": + author = r.Channel.GooglePlayAuthor + case r.Channel.PodcastOwner.String() != "": + author = r.Channel.PodcastOwner.String() } return sanitizer.StripTags(strings.TrimSpace(author)) } @@ -146,27 +148,7 @@ type rssGUID struct { IsPermaLink string `xml:"isPermaLink,attr"` } -type rssLink struct { - XMLName xml.Name - Data string `xml:",chardata"` - Href string `xml:"href,attr"` - Rel string `xml:"rel,attr"` -} - -type rssCommentLink struct { - XMLName xml.Name - Data 
string `xml:",chardata"` -} - type rssAuthor struct { - XMLName xml.Name - Data string `xml:",chardata"` - Name string `xml:"name"` - Email string `xml:"email"` - Inner string `xml:",innerxml"` -} - -type rssTitle struct { XMLName xml.Name Data string `xml:",chardata"` Inner string `xml:",innerxml"` @@ -193,19 +175,21 @@ func (enclosure *rssEnclosure) Size() int64 { } type rssItem struct { - GUID rssGUID `xml:"guid"` - Title []rssTitle `xml:"title"` - Links []rssLink `xml:"link"` - Description string `xml:"description"` - PubDate string `xml:"pubDate"` - Authors []rssAuthor `xml:"author"` - CommentLinks []rssCommentLink `xml:"comments"` - EnclosureLinks []rssEnclosure `xml:"enclosure"` - Categories []rssCategory `xml:"category"` + GUID rssGUID `xml:"rss guid"` + Title string `xml:"rss title"` + Link string `xml:"rss link"` + Description string `xml:"rss description"` + PubDate string `xml:"rss pubDate"` + Author rssAuthor `xml:"rss author"` + Comments string `xml:"rss comments"` + EnclosureLinks []rssEnclosure `xml:"rss enclosure"` + Categories []rssCategory `xml:"rss category"` dublincore.DublinCoreItemElement FeedBurnerElement PodcastEntryElement media.Element + AtomAuthor + AtomLinks } func (r *rssItem) Transform() *model.Entry { @@ -250,34 +234,26 @@ func (r *rssItem) entryDate() time.Time { } func (r *rssItem) entryAuthor() string { - author := "" + var author string - for _, rssAuthor := range r.Authors { - switch rssAuthor.XMLName.Space { - case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0": - author = rssAuthor.Data - case "http://www.w3.org/2005/Atom": - if rssAuthor.Name != "" { - author = rssAuthor.Name - } else if rssAuthor.Email != "" { - author = rssAuthor.Email - } - default: - if rssAuthor.Name != "" { - author = rssAuthor.Name - } else if strings.Contains(rssAuthor.Inner, "