Handle RSS entries with only a GUID permalink

This commit is contained in:
Frédéric Guillot 2022-10-09 16:51:39 -07:00
parent cd7f01f573
commit d947b0194b
2 changed files with 47 additions and 2 deletions

View File

@@ -95,6 +95,10 @@ func TestParseRss2Sample(t *testing.T) {
if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.` {
t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
}
if feed.Entries[1].URL != "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572" {
t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL)
}
}
func TestParseFeedWithoutTitle(t *testing.T) {
@@ -230,6 +234,34 @@ func TestParseEntryWithoutLink(t *testing.T) {
}
}
// TestParseEntryWithOnlyGuidPermalink checks that an RSS item without any
// <link> element gets its entry URL from the <guid> element, both when
// isPermaLink="true" is explicit and when the attribute is omitted.
func TestParseEntryWithOnlyGuidPermalink(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<link>https://example.org/</link>
<item>
<guid isPermaLink="true">https://example.org/some-article.html</guid>
</item>
<item>
<guid>https://example.org/another-article.html</guid>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Fail with a readable message instead of an index-out-of-range panic
	// if the parser returns fewer items than the document contains.
	if len(feed.Entries) != 2 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	// First item: explicit isPermaLink="true".
	if feed.Entries[0].URL != "https://example.org/some-article.html" {
		t.Errorf("Incorrect entry link, got: %s", feed.Entries[0].URL)
	}

	// Second item: no isPermaLink attribute at all.
	if feed.Entries[1].URL != "https://example.org/another-article.html" {
		t.Errorf("Incorrect entry link, got: %s", feed.Entries[1].URL)
	}
}
func TestParseEntryWithAtomLink(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">

View File

@@ -118,6 +118,12 @@ func (r rssFeed) feedAuthor() string {
return sanitizer.StripTags(strings.TrimSpace(author))
}
// rssGUID maps the <guid> element of an RSS item (the `xml:"guid"` field of
// rssItem), keeping both its text and its isPermaLink attribute.
type rssGUID struct {
// XMLName receives the name of the decoded XML element (encoding/xml convention).
XMLName xml.Name
// Data is the character data of the element, i.e. the identifier itself.
Data string `xml:",chardata"`
// IsPermaLink is the raw value of the isPermaLink attribute. It stays empty
// when the attribute is absent; entryURL accepts both "" and "true" as a
// permalink.
IsPermaLink string `xml:"isPermaLink,attr"`
}
type rssLink struct {
XMLName xml.Name
Data string `xml:",chardata"`
@@ -159,7 +165,7 @@ func (enclosure *rssEnclosure) Size() int64 {
}
type rssItem struct {
GUID string `xml:"guid"`
GUID rssGUID `xml:"guid"`
Title []rssTitle `xml:"title"`
Links []rssLink `xml:"link"`
Description string `xml:"description"`
@@ -237,7 +243,7 @@ func (r *rssItem) entryAuthor() string {
}
func (r *rssItem) entryHash() string {
for _, value := range []string{r.GUID, r.entryURL()} {
for _, value := range []string{r.GUID.Data, r.entryURL()} {
if value != "" {
return crypto.Hash(value)
}
@@ -291,6 +297,13 @@ func (r *rssItem) entryURL() string {
}
}
// Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
// isPermaLink is optional, its default value is true.
// If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
if r.GUID.IsPermaLink == "true" || r.GUID.IsPermaLink == "" {
return strings.TrimSpace(r.GUID.Data)
}
return ""
}