From d947b0194b5563601552047261aeae04ac6708c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?=
Date: Sun, 9 Oct 2022 16:51:39 -0700
Subject: [PATCH] Handle RSS entries with only a GUID permalink

---
 reader/rss/parser_test.go | 32 ++++++++++++++++++++++++++++++++
 reader/rss/rss.go         | 17 +++++++++++++++--
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go
index 9be293b4..4b7bf761 100644
--- a/reader/rss/parser_test.go
+++ b/reader/rss/parser_test.go
@@ -95,6 +95,10 @@ func TestParseRss2Sample(t *testing.T) {
 	if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.` {
 		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
 	}
+
+	if feed.Entries[1].URL != "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572" {
+		t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL)
+	}
 }
 
 func TestParseFeedWithoutTitle(t *testing.T) {
@@ -230,6 +234,34 @@ func TestParseEntryWithoutLink(t *testing.T) {
 	}
 }
 
+func TestParseEntryWithOnlyGuidPermalink(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0">
+		<channel>
+			<link>https://example.org/</link>
+			<item>
+				<guid isPermaLink="true">https://example.org/some-article.html</guid>
+			</item>
+			<item>
+				<guid>https://example.org/another-article.html</guid>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].URL != "https://example.org/some-article.html" {
+		t.Errorf("Incorrect entry link, got: %s", feed.Entries[0].URL)
+	}
+
+	if feed.Entries[1].URL != "https://example.org/another-article.html" {
+		t.Errorf("Incorrect entry link, got: %s", feed.Entries[1].URL)
+	}
+}
+
 func TestParseEntryWithAtomLink(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
diff --git a/reader/rss/rss.go b/reader/rss/rss.go
index fb042632..76ed9c9a 100644
--- a/reader/rss/rss.go
+++ b/reader/rss/rss.go
@@ -118,6 +118,12 @@ func (r rssFeed) feedAuthor() string {
 	return sanitizer.StripTags(strings.TrimSpace(author))
 }
 
+type rssGUID struct {
+	XMLName     xml.Name
+	Data        string `xml:",chardata"`
+	IsPermaLink string `xml:"isPermaLink,attr"`
+}
+
 type rssLink struct {
 	XMLName xml.Name
 	Data string `xml:",chardata"`
@@ -159,7 +165,7 @@ func (enclosure *rssEnclosure) Size() int64 {
 }
 
 type rssItem struct {
-	GUID string `xml:"guid"`
+	GUID rssGUID `xml:"guid"`
 	Title []rssTitle `xml:"title"`
 	Links []rssLink `xml:"link"`
 	Description string `xml:"description"`
@@ -237,7 +243,7 @@ func (r *rssItem) entryAuthor() string {
 }
 
 func (r *rssItem) entryHash() string {
-	for _, value := range []string{r.GUID, r.entryURL()} {
+	for _, value := range []string{r.GUID.Data, r.entryURL()} {
 		if value != "" {
 			return crypto.Hash(value)
 		}
@@ -291,6 +297,13 @@ func (r *rssItem) entryURL() string {
 		}
 	}
 
+	// Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
+	// isPermaLink is optional, its default value is true.
+	// If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
+	if r.GUID.IsPermaLink == "true" || r.GUID.IsPermaLink == "" {
+		return strings.TrimSpace(r.GUID.Data)
+	}
+
 	return ""
 }
 