Refactor RSS parser to use default namespace

This change avoids some limitations of the Go XML parser regarding XML namespaces.
2024-03-11 20:43:14 -07:00 · 2024-03-11 20:43:14 -07:00 · 9a637ce95e
parent d3a85b049b
commit 9a637ce95e
6 changed files with 185 additions and 181 deletions
--- a/internal/reader/media/media.go
+++ b/internal/reader/media/media.go
@ -12,6 +12,7 @@ import (
 // textLinkRegex matches http:// and https:// URLs embedded in text.
 // The (?mi) flags make the match multi-line and case-insensitive; the final
 // character class omits the punctuation (such as '.', ',', ';', ':', '!', '?')
 // that the main class accepts, so a match cannot end on one of those characters.
 var textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
 // Element represents XML media elements.
 // Specs: https://www.rssboard.org/media-rss
 type Element struct {
 	MediaGroups       []Group         `xml:"http://search.yahoo.com/mrss/ group"`
 	MediaContents     []Content       `xml:"http://search.yahoo.com/mrss/ content"`
--- a/internal/reader/rss/atom.go
+++ b/internal/reader/rss/atom.go
@ -0,0 +1,43 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 package rss // import "miniflux.app/v2/internal/reader/rss"
 import "strings"
 // AtomAuthor is meant to be embedded in a parent element; it captures the
 // child <author> element that belongs to the Atom namespace.
 type AtomAuthor struct {
 	Author AtomPerson `xml:"http://www.w3.org/2005/Atom author"`
 }
 // String returns the textual form of the wrapped Atom person.
 func (a *AtomAuthor) String() string {
 	person := &a.Author
 	return person.String()
 }
 // AtomPerson holds the <name> and <email> child elements of a person element.
 type AtomPerson struct {
 	Name  string `xml:"name"`
 	Email string `xml:"email"`
 }
 // String returns the person's name with surrounding whitespace removed.
 // When the name is empty or contains only whitespace, the trimmed email is
 // returned instead. The result is an empty string when neither value is set.
 func (a *AtomPerson) String() string {
 	// Trim before testing so that a whitespace-only name does not hide the email.
 	if name := strings.TrimSpace(a.Name); name != "" {
 		return name
 	}
 	return strings.TrimSpace(a.Email)
 }
 // AtomLink holds the attributes read from a single link element.
 type AtomLink struct {
 	// URL is the value of the "href" attribute.
 	URL    string `xml:"href,attr"`
 	// Type is the value of the "type" attribute.
 	Type   string `xml:"type,attr"`
 	// Rel is the value of the "rel" attribute.
 	Rel    string `xml:"rel,attr"`
 	// Length is the raw, unparsed value of the "length" attribute.
 	Length string `xml:"length,attr"`
 }
 // AtomLinks collects every child <link> element that belongs to the Atom
 // namespace (http://www.w3.org/2005/Atom).
 type AtomLinks struct {
 	Links []*AtomLink `xml:"http://www.w3.org/2005/Atom link"`
 }
--- a/internal/reader/rss/parser.go
+++ b/internal/reader/rss/parser.go
@ -14,7 +14,9 @@ import (
 // Parse returns a normalized feed struct from a RSS feed.
 func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) {
 	feed := new(rssFeed)
-	if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
+	decoder := xml.NewXMLDecoder(data)
 	decoder.DefaultSpace = "rss"
 	if err := decoder.Decode(feed); err != nil {
 		return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
 	}
 	return feed.Transform(baseURL), nil
--- a/internal/reader/rss/parser_test.go
+++ b/internal/reader/rss/parser_test.go
@ -300,7 +300,7 @@ func TestParseEntryWithMultipleAtomLinks(t *testing.T) {
 			<item>
 				<title>Test</title>
 				<atom:link rel="payment" href="https://example.org/a" />
-				<atom:link rel="http://foobar.tld" href="https://example.org/b" />
+				<atom:link rel="alternate" href="https://example.org/b" />
 			</item>
 		</channel>
 		</rss>`
@ -430,7 +430,7 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
 				<title>Test</title>
 				<link>https://example.org/item</link>
 				<author>
-					by <![CDATA[Foo Bar]]>
+					<![CDATA[by Foo Bar]]>
 				</author>
 			</item>
 		</channel>
@ -447,38 +447,6 @@ func TestParseEntryWithAuthorAndCDATA(t *testing.T) {
 	}
 }
 // TestParseEntryWithNonStandardAtomAuthor feeds the parser an item whose
 // un-prefixed <author> element contains <name>, <title>, <department> and
 // <company> children, and expects the entry author to be the <name> value.
 func TestParseEntryWithNonStandardAtomAuthor(t *testing.T) {
 	const want = "Foo Bar"
 	payload := []byte(`<?xml version="1.0" encoding="utf-8"?>
 		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
 		<channel>
 			<title>Example</title>
 			<link>https://example.org/</link>
 			<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link>
 			<item>
 				<title>Test</title>
 				<link>https://example.org/item</link>
 				<author xmlns:author="http://www.w3.org/2005/Atom">
 					<name>Foo Bar</name>
 					<title>Vice President</title>
 					<department/>
 					<company>FooBar Inc.</company>
 				</author>
 			</item>
 		</channel>
 		</rss>`)
 	parsed, parseErr := Parse("https://example.org/", bytes.NewReader(payload))
 	if parseErr != nil {
 		t.Fatal(parseErr)
 	}
 	if got := parsed.Entries[0].Author; got != want {
 		t.Errorf("Incorrect entry author, got %q instead of %q", got, want)
 	}
 }
 func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
@ -508,7 +476,7 @@ func TestParseEntryWithAtomAuthorEmail(t *testing.T) {
 	}
 }
-func TestParseEntryWithAtomAuthor(t *testing.T) {
+func TestParseEntryWithAtomAuthorName(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
 		<channel>
@ -1435,6 +1403,37 @@ func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) {
 	}
 }
 // TestParseEntryWithRSSDescriptionAndMediaDescription parses an item that
 // carries both an RSS <description> and a <media:description>, and expects
 // the entry content to be the RSS description.
 func TestParseEntryWithRSSDescriptionAndMediaDescription(t *testing.T) {
 	data := `<?xml version="1.0" encoding="UTF-8"?>
 	<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
 		<channel>
 			<title>Podcast Example</title>
 			<link>http://www.example.com/index.html</link>
 			<item>
 				<title>Entry Title</title>
 				<link>http://www.example.com/entries/1</link>
 				<description>Entry Description</description>
 				<media:description type="plain">Media Description</media:description>
 			</item>
 		</channel>
 	</rss>`
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
 	if err != nil {
 		t.Fatal(err)
 	}
 	// Stop here on a count mismatch: indexing Entries[0] below would panic
 	// with an empty slice and hide the real failure.
 	if len(feed.Entries) != 1 {
 		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
 	}
 	expected := "Entry Description"
 	result := feed.Entries[0].Content
 	if expected != result {
 		t.Errorf(`Unexpected description, got %q instead of %q`, result, expected)
 	}
 }
 func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
--- a/internal/reader/rss/podcast.go
+++ b/internal/reader/rss/podcast.go
@ -15,21 +15,24 @@ var ErrInvalidDurationFormat = errors.New("rss: invalid duration format")
 // PodcastFeedElement represents iTunes and GooglePlay feed XML elements.
 // Specs:
 // - https://github.com/simplepie/simplepie-ng/wiki/Spec:-iTunes-Podcast-RSS
-// - https://developers.google.com/search/reference/podcast/rss-feed
+// - https://support.google.com/podcast-publishers/answer/9889544
 type PodcastFeedElement struct {
-	ItunesAuthor     string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
+	ItunesAuthor     string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
-	Subtitle         string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>subtitle"`
+	Subtitle         string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
-	Summary          string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>summary"`
+	Summary          string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
-	PodcastOwner     PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>owner"`
+	PodcastOwner     PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
-	GooglePlayAuthor string       `xml:"http://www.google.com/schemas/play-podcasts/1.0 channel>author"`
+	GooglePlayAuthor string       `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
 }
 // PodcastEntryElement represents iTunes and GooglePlay entry XML elements.
 type PodcastEntryElement struct {
-	Subtitle              string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
+	ItunesAuthor          string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd author"`
-	Summary               string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
+	Subtitle              string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd subtitle"`
-	Duration              string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
+	Summary               string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd summary"`
-	GooglePlayDescription string `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
+	Duration              string       `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd duration"`
 	PodcastOwner          PodcastOwner `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd owner"`
 	GooglePlayAuthor      string       `xml:"http://www.google.com/schemas/play-podcasts/1.0 author"`
 	GooglePlayDescription string       `xml:"http://www.google.com/schemas/play-podcasts/1.0 description"`
 }
 // PodcastOwner represents contact information for the podcast owner.
@ -38,6 +41,19 @@ type PodcastOwner struct {
 	Email string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd email"`
 }
 // String returns the owner's name with surrounding whitespace removed.
 // When the name is empty or contains only whitespace, the trimmed email is
 // returned instead. The result is an empty string when neither value is set.
 func (p *PodcastOwner) String() string {
 	// Trim before testing so that a whitespace-only name does not hide the email.
 	if name := strings.TrimSpace(p.Name); name != "" {
 		return name
 	}
 	return strings.TrimSpace(p.Email)
 }
 // Image represents podcast artwork.
 type Image struct {
 	URL string `xml:"href,attr"`
@ -52,10 +68,8 @@ func (e *PodcastFeedElement) PodcastAuthor() string {
 		author = e.ItunesAuthor
 	case e.GooglePlayAuthor != "":
 		author = e.GooglePlayAuthor
-	case e.PodcastOwner.Name != "":
+	case e.PodcastOwner.String() != "":
-		author = e.PodcastOwner.Name
+		author = e.PodcastOwner.String()
 	case e.PodcastOwner.Email != "":
 		author = e.PodcastOwner.Email
 	}
 	return strings.TrimSpace(author)
--- a/internal/reader/rss/rss.go
+++ b/internal/reader/rss/rss.go
@ -21,20 +21,25 @@ import (
 	"miniflux.app/v2/internal/urllib"
 )
-// Specs: https://cyber.harvard.edu/rss/rss.html
+// Specs: https://www.rssboard.org/rss-specification
 type rssFeed struct {
-	XMLName        xml.Name  `xml:"rss"`
+	XMLName xml.Name   `xml:"rss"`
-	Version        string    `xml:"version,attr"`
+	Version string     `xml:"rss version,attr"`
-	Title          string    `xml:"channel>title"`
+	Channel rssChannel `xml:"rss channel"`
-	Links          []rssLink `xml:"channel>link"`
+}
-	ImageURL       string    `xml:"channel>image>url"`
+
-	Language       string    `xml:"channel>language"`
+type rssChannel struct {
-	Description    string    `xml:"channel>description"`
+	Title          string    `xml:"rss title"`
-	PubDate        string    `xml:"channel>pubDate"`
+	Link           string    `xml:"rss link"`
-	ManagingEditor string    `xml:"channel>managingEditor"`
+	ImageURL       string    `xml:"rss image>url"`
-	Webmaster      string    `xml:"channel>webMaster"`
+	Language       string    `xml:"rss language"`
-	TimeToLive     rssTTL    `xml:"channel>ttl"`
+	Description    string    `xml:"rss description"`
-	Items          []rssItem `xml:"channel>item"`
+	PubDate        string    `xml:"rss pubDate"`
 	ManagingEditor string    `xml:"rss managingEditor"`
 	Webmaster      string    `xml:"rss webMaster"`
 	TimeToLive     rssTTL    `xml:"rss ttl"`
 	Items          []rssItem `xml:"rss item"`
 	AtomLinks
 	PodcastFeedElement
 }
@ -72,15 +77,15 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
 		feed.FeedURL = feedURL
 	}
-	feed.Title = html.UnescapeString(strings.TrimSpace(r.Title))
+	feed.Title = html.UnescapeString(strings.TrimSpace(r.Channel.Title))
 	if feed.Title == "" {
 		feed.Title = feed.SiteURL
 	}
-	feed.IconURL = strings.TrimSpace(r.ImageURL)
+	feed.IconURL = strings.TrimSpace(r.Channel.ImageURL)
-	feed.TTL = r.TimeToLive.Value()
+	feed.TTL = r.Channel.TimeToLive.Value()
-	for _, item := range r.Items {
+	for _, item := range r.Channel.Items {
 		entry := item.Transform()
 		if entry.Author == "" {
 			entry.Author = r.feedAuthor()
@ -110,32 +115,29 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
 }
 func (r *rssFeed) siteURL() string {
-	for _, element := range r.Links {
+	return strings.TrimSpace(r.Channel.Link)
 		if element.XMLName.Space == "" {
 			return strings.TrimSpace(element.Data)
 		}
 	}
 	return ""
 }
 func (r *rssFeed) feedURL() string {
-	for _, element := range r.Links {
+	for _, atomLink := range r.Channel.AtomLinks.Links {
-		if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
+		if atomLink.Rel == "self" {
-			return strings.TrimSpace(element.Href)
+			return strings.TrimSpace(atomLink.URL)
 		}
 	}
 	return ""
 }
 func (r rssFeed) feedAuthor() string {
-	author := r.PodcastAuthor()
+	author := r.Channel.PodcastAuthor()
 	switch {
-	case r.ManagingEditor != "":
+	case r.Channel.ManagingEditor != "":
-		author = r.ManagingEditor
+		author = r.Channel.ManagingEditor
-	case r.Webmaster != "":
+	case r.Channel.Webmaster != "":
-		author = r.Webmaster
+		author = r.Channel.Webmaster
 	case r.Channel.GooglePlayAuthor != "":
 		author = r.Channel.GooglePlayAuthor
 	case r.Channel.PodcastOwner.String() != "":
 		author = r.Channel.PodcastOwner.String()
 	}
 	return sanitizer.StripTags(strings.TrimSpace(author))
 }
@ -146,27 +148,7 @@ type rssGUID struct {
 	IsPermaLink string `xml:"isPermaLink,attr"`
 }
 // rssLink captures a link element together with its qualified XML name.
 type rssLink struct {
 	// XMLName records the element's name and namespace.
 	XMLName xml.Name
 	// Data is the element's character data.
 	Data    string `xml:",chardata"`
 	// Href is the value of the "href" attribute.
 	Href    string `xml:"href,attr"`
 	// Rel is the value of the "rel" attribute.
 	Rel     string `xml:"rel,attr"`
 }
 // rssCommentLink captures a comments element together with its qualified XML name.
 type rssCommentLink struct {
 	// XMLName records the element's name and namespace.
 	XMLName xml.Name
 	// Data is the element's character data.
 	Data    string `xml:",chardata"`
 }
 // rssAuthor captures an author element in several forms: its qualified XML
 // name, its character data, optional <name>/<email> children, and its raw
 // inner XML.
 type rssAuthor struct {
 	// XMLName records the element's name and namespace.
 	XMLName xml.Name
 	// Data is the element's character data.
 	Data    string `xml:",chardata"`
 	// Name is the content of a child <name> element, if any.
 	Name    string `xml:"name"`
 	// Email is the content of a child <email> element, if any.
 	Email   string `xml:"email"`
 	// Inner is the raw, unparsed XML nested inside the element.
 	Inner   string `xml:",innerxml"`
 }
 type rssTitle struct {
 	XMLName xml.Name
 	Data    string `xml:",chardata"`
 	Inner   string `xml:",innerxml"`
@ -193,19 +175,21 @@ func (enclosure *rssEnclosure) Size() int64 {
 }
 type rssItem struct {
-	GUID           rssGUID          `xml:"guid"`
+	GUID           rssGUID        `xml:"rss guid"`
-	Title          []rssTitle       `xml:"title"`
+	Title          string         `xml:"rss title"`
-	Links          []rssLink        `xml:"link"`
+	Link           string         `xml:"rss link"`
-	Description    string           `xml:"description"`
+	Description    string         `xml:"rss description"`
-	PubDate        string           `xml:"pubDate"`
+	PubDate        string         `xml:"rss pubDate"`
-	Authors        []rssAuthor      `xml:"author"`
+	Author         rssAuthor      `xml:"rss author"`
-	CommentLinks   []rssCommentLink `xml:"comments"`
+	Comments       string         `xml:"rss comments"`
-	EnclosureLinks []rssEnclosure   `xml:"enclosure"`
+	EnclosureLinks []rssEnclosure `xml:"rss enclosure"`
-	Categories     []rssCategory    `xml:"category"`
+	Categories     []rssCategory  `xml:"rss category"`
 	dublincore.DublinCoreItemElement
 	FeedBurnerElement
 	PodcastEntryElement
 	media.Element
 	AtomAuthor
 	AtomLinks
 }
 func (r *rssItem) Transform() *model.Entry {
@ -250,34 +234,26 @@ func (r *rssItem) entryDate() time.Time {
 }
 func (r *rssItem) entryAuthor() string {
-	author := ""
+	var author string
-	for _, rssAuthor := range r.Authors {
+	switch {
-		switch rssAuthor.XMLName.Space {
+	case r.PodcastOwner.String() != "":
-		case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0":
+		author = r.PodcastOwner.String()
-			author = rssAuthor.Data
+	case r.GooglePlayAuthor != "":
-		case "http://www.w3.org/2005/Atom":
+		author = r.GooglePlayAuthor
-			if rssAuthor.Name != "" {
+	case r.ItunesAuthor != "":
-				author = rssAuthor.Name
+		author = r.ItunesAuthor
-			} else if rssAuthor.Email != "" {
+	case r.DublinCoreCreator != "":
-				author = rssAuthor.Email
+		author = r.DublinCoreCreator
-			}
+	case r.AtomAuthor.String() != "":
-		default:
+		author = r.AtomAuthor.String()
-			if rssAuthor.Name != "" {
+	case strings.Contains(r.Author.Inner, "<![CDATA["):
-				author = rssAuthor.Name
+		author = r.Author.Data
-			} else if strings.Contains(rssAuthor.Inner, "<![CDATA[") {
+	default:
-				author = rssAuthor.Data
+		author = r.Author.Inner
 			} else {
 				author = rssAuthor.Inner
 			}
 		}
 	}
-	if author == "" {
+	return strings.TrimSpace(sanitizer.StripTags(author))
 		author = r.GetSanitizedCreator()
 	}
 	return sanitizer.StripTags(strings.TrimSpace(author))
 }
 func (r *rssItem) entryHash() string {
@ -291,21 +267,10 @@ func (r *rssItem) entryHash() string {
 }
 func (r *rssItem) entryTitle() string {
-	var title string
+	title := r.Title
-	for _, rssTitle := range r.Title {
+	if r.DublinCoreTitle != "" {
-		switch rssTitle.XMLName.Space {
+		title = r.DublinCoreTitle
 		case "http://search.yahoo.com/mrss/":
 			// Ignore title in media namespace
 		case "http://purl.org/dc/elements/1.1/":
 			title = rssTitle.Data
 		default:
 			title = rssTitle.Data
 		}
 		if title != "" {
 			break
 		}
 	}
 	return html.UnescapeString(strings.TrimSpace(title))
@ -321,17 +286,15 @@ func (r *rssItem) entryContent() string {
 }
 func (r *rssItem) entryURL() string {
-	if r.FeedBurnerLink != "" {
+	for _, link := range []string{r.FeedBurnerLink, r.Link} {
-		return r.FeedBurnerLink
+		if link != "" {
 			return strings.TrimSpace(link)
 		}
 	}
-	for _, link := range r.Links {
+	for _, atomLink := range r.AtomLinks.Links {
-		if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
+		if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
-			return strings.TrimSpace(link.Href)
+			return strings.TrimSpace(atomLink.URL)
 		}
 		if link.Data != "" {
 			return strings.TrimSpace(link.Data)
 		}
 	}
@ -425,28 +388,10 @@ func (r *rssItem) entryCategories() []string {
 }
 func (r *rssItem) entryCommentsURL() string {
-	for _, commentLink := range r.CommentLinks {
+	commentsURL := strings.TrimSpace(r.Comments)
-		if commentLink.XMLName.Space == "" {
+	if commentsURL != "" && urllib.IsAbsoluteURL(commentsURL) {
-			commentsURL := strings.TrimSpace(commentLink.Data)
+		return commentsURL
 			// The comments URL is supposed to be absolute (some feeds publish an incorrect comments URL)
 			// See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
 			if urllib.IsAbsoluteURL(commentsURL) {
 				return commentsURL
 			}
 		}
 	}
 	return ""
 }
 // isValidLinkRelation reports whether rel is an accepted link relation.
 // Accepted values are the empty string, "alternate", "enclosure", "related",
 // "self", "via", and any value that starts with "http".
 func isValidLinkRelation(rel string) bool {
 	if strings.HasPrefix(rel, "http") {
 		return true
 	}
 	for _, accepted := range [...]string{"", "alternate", "enclosure", "related", "self", "via"} {
 		if rel == accepted {
 			return true
 		}
 	}
 	return false
 }