Use the truncated entry description as the title when an entry has no title

This commit is contained in:
Frédéric Guillot 2022-03-04 16:49:44 -08:00
parent c9e0f0b3e4
commit 1eb01b39e7
10 changed files with 314 additions and 24 deletions

View File

@ -60,6 +60,10 @@ func (a *atom03Feed) Transform(baseURL string) *model.Feed {
item.Author = a.Author.String()
}
if item.Title == "" {
item.Title = sanitizer.TruncateHTML(item.Content, 100)
}
if item.Title == "" {
item.Title = item.URL
}

View File

@ -98,7 +98,7 @@ func TestParseAtom03WithoutFeedTitle(t *testing.T) {
}
}
func TestParseAtom03WithoutEntryTitle(t *testing.T) {
func TestParseAtom03WithoutEntryTitleButWithLink(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
<title>dive into mark</title>
@ -125,6 +125,62 @@ func TestParseAtom03WithoutEntryTitle(t *testing.T) {
}
}
// TestParseAtom03WithoutEntryTitleButWithSummary checks that an Atom 0.3 entry
// without a <title> element gets its title from the plain-text summary.
func TestParseAtom03WithoutEntryTitleButWithSummary(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
<title>dive into mark</title>
<link rel="alternate" type="text/html" href="http://diveintomark.org/"/>
<modified>2003-12-13T18:30:02Z</modified>
<author><name>Mark Pilgrim</name></author>
<entry>
<link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/>
<id>tag:diveintomark.org,2003:3.2397</id>
<summary type="text/plain">It&apos;s a test</summary>
</entry>
</feed>`

	feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Stop on a count mismatch: indexing Entries[0] below would panic on an
	// empty slice and bury the real failure under a stack trace.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	if feed.Entries[0].Title != "It's a test" {
		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
	}
}
// TestParseAtom03WithoutEntryTitleButWithXMLContent checks that an Atom 0.3
// entry without a <title> element gets its title from the text of its
// XML-mode content, with the markup removed.
func TestParseAtom03WithoutEntryTitleButWithXMLContent(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
<title>dive into mark</title>
<link rel="alternate" type="text/html" href="http://diveintomark.org/"/>
<modified>2003-12-13T18:30:02Z</modified>
<author><name>Mark Pilgrim</name></author>
<entry>
<link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/>
<id>tag:diveintomark.org,2003:3.2397</id>
<content mode="xml" type="text/html"><p>Some text.</p></content>
</entry>
</feed>`

	feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Stop on a count mismatch: indexing Entries[0] below would panic on an
	// empty slice and bury the real failure under a stack trace.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	if feed.Entries[0].Title != "Some text." {
		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
	}
}
func TestParseAtom03WithSummaryOnly(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed version="0.3" xmlns="http://purl.org/atom/ns#">

View File

@ -16,6 +16,7 @@ import (
"miniflux.app/model"
"miniflux.app/reader/date"
"miniflux.app/reader/media"
"miniflux.app/reader/sanitizer"
"miniflux.app/url"
)
@ -64,6 +65,10 @@ func (a *atom10Feed) Transform(baseURL string) *model.Feed {
item.Author = a.Authors.String()
}
if item.Title == "" {
item.Title = sanitizer.TruncateHTML(item.Content, 100)
}
if item.Title == "" {
item.Title = item.URL
}

View File

@ -100,7 +100,37 @@ func TestParseFeedWithoutTitle(t *testing.T) {
}
}
func TestParseEntryWithoutTitle(t *testing.T) {
// TestParseEntryWithoutTitleButWithURL checks that an Atom entry with neither
// a title nor any summary/content falls back to the entry link as its title.
func TestParseEntryWithoutTitleButWithURL(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
<entry>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
</entry>
</feed>`

	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below: without it a parser regression that drops the
	// entry would surface as an index-out-of-range panic.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	if feed.Entries[0].Title != "http://example.org/2003/12/13/atom03" {
		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
	}
}
func TestParseEntryWithoutTitleButWithSummary(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
@ -126,7 +156,40 @@ func TestParseEntryWithoutTitle(t *testing.T) {
t.Fatal(err)
}
if feed.Entries[0].Title != "http://example.org/2003/12/13/atom03" {
if feed.Entries[0].Title != "Some text." {
t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
}
}
// TestParseEntryWithoutTitleButWithXHTMLContent checks that an Atom entry
// without a title gets one from its XHTML content: tags are removed and the
// &amp; entity ends up as a literal ampersand.
func TestParseEntryWithoutTitleButWithXHTMLContent(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
<entry>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">AT&amp;T bought <b>by SBC</b>!</div>
</content>
</entry>
</feed>`

	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below: without it a parser regression that drops the
	// entry would surface as an index-out-of-range panic.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	if feed.Entries[0].Title != "AT&T bought by SBC!" {
		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
	}
}

View File

@ -12,6 +12,7 @@ import (
"miniflux.app/logger"
"miniflux.app/model"
"miniflux.app/reader/date"
"miniflux.app/reader/sanitizer"
"miniflux.app/url"
)
@ -130,9 +131,13 @@ func (j *jsonItem) GetHash() string {
}
func (j *jsonItem) GetTitle() string {
for _, value := range []string{j.Title, j.Summary, j.Text, j.URL} {
if j.Title != "" {
return j.Title
}
for _, value := range []string{j.Summary, j.Text, j.HTML} {
if value != "" {
return truncate(value)
return sanitizer.TruncateHTML(value, 100)
}
}
@ -186,16 +191,3 @@ func getAuthor(author jsonAuthor) string {
return ""
}
// truncate strips surrounding whitespace from str and caps the result at 100
// runes. When the trimmed text is longer than that, the first 100 runes are
// returned followed by an ellipsis; otherwise the trimmed text is returned
// unchanged.
func truncate(str string) string {
	const limit = 100

	trimmed := strings.TrimSpace(str)

	// Count runes rather than bytes so a multi-byte character is never split.
	chars := []rune(trimmed)
	if len(chars) <= limit {
		return trimmed
	}

	return string(chars[:limit]) + "…"
}

View File

@ -76,7 +76,7 @@ func TestParseJsonFeed(t *testing.T) {
t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL)
}
if feed.Entries[1].Title != "https://example.org/initial-post" {
if feed.Entries[1].Title != "Hello, world!" {
t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[1].Title)
}
@ -398,7 +398,7 @@ func TestParseFeedItemWithoutID(t *testing.T) {
}
}
func TestParseFeedItemWithoutTitle(t *testing.T) {
func TestParseFeedItemWithoutTitleButWithURL(t *testing.T) {
data := `{
"version": "https://jsonfeed.org/version/1",
"title": "My Example Feed",
@ -425,7 +425,7 @@ func TestParseFeedItemWithoutTitle(t *testing.T) {
}
}
func TestParseTruncateItemTitle(t *testing.T) {
func TestParseFeedItemWithoutTitleButWithSummary(t *testing.T) {
data := `{
"version": "https://jsonfeed.org/version/1",
"title": "My Example Feed",
@ -433,7 +433,61 @@ func TestParseTruncateItemTitle(t *testing.T) {
"feed_url": "https://example.org/feed.json",
"items": [
{
"title": "` + strings.Repeat("a", 200) + `"
"summary": "This is some text content."
}
]
}`
feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data))
if err != nil {
t.Fatal(err)
}
if len(feed.Entries) != 1 {
t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries))
}
if feed.Entries[0].Title != "This is some text content." {
t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
}
}
// TestParseFeedItemWithoutTitleButWithHTMLContent checks that a JSON Feed item
// without a title gets one from content_html, with the markup removed.
func TestParseFeedItemWithoutTitleButWithHTMLContent(t *testing.T) {
	data := `{
"version": "https://jsonfeed.org/version/1",
"title": "My Example Feed",
"home_page_url": "https://example.org/",
"feed_url": "https://example.org/feed.json",
"items": [
{
"content_html": "This is <strong>HTML</strong>."
}
]
}`

	feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Stop on a count mismatch: indexing Entries[0] below would panic on an
	// empty slice and bury the real failure under a stack trace.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	if feed.Entries[0].Title != "This is HTML." {
		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
	}
}
func TestParseFeedItemWithoutTitleButWithTextContent(t *testing.T) {
data := `{
"version": "https://jsonfeed.org/version/1",
"title": "My Example Feed",
"home_page_url": "https://example.org/",
"feed_url": "https://example.org/feed.json",
"items": [
{
"content_text": "` + strings.Repeat("a", 200) + `"
}
]
}`
@ -448,7 +502,7 @@ func TestParseTruncateItemTitle(t *testing.T) {
}
if len(feed.Entries[0].Title) != 103 {
t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
t.Errorf("Incorrect entry title, got: %d", len(feed.Entries[0].Title))
}
if len([]rune(feed.Entries[0].Title)) != 101 {

View File

@ -115,7 +115,7 @@ func TestParseFeedWithoutTitle(t *testing.T) {
}
}
func TestParseEntryWithoutTitle(t *testing.T) {
func TestParseEntryWithoutTitleAndDescription(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
@ -136,6 +136,30 @@ func TestParseEntryWithoutTitle(t *testing.T) {
}
}
// TestParseEntryWithoutTitleButWithDescription checks that an RSS item without
// a title gets one from its description, with the surrounding whitespace
// removed.
func TestParseEntryWithoutTitleButWithDescription(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<link>https://example.org/</link>
<item>
<link>https://example.org/item</link>
<description>
This is the description
</description>
</item>
</channel>
</rss>`

	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index below: without it a parser regression that drops the
	// item would surface as an index-out-of-range panic.
	if len(feed.Entries) != 1 {
		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
	}

	if feed.Entries[0].Title != "This is the description" {
		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
	}
}
func TestParseEntryWithMediaTitle(t *testing.T) {
data := `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">

View File

@ -73,6 +73,10 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
}
}
if entry.Title == "" {
entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
}
if entry.Title == "" {
entry.Title = entry.URL
}

View File

@ -0,0 +1,23 @@
// Copyright 2022 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package sanitizer
import "strings"
// TruncateHTML returns the plain-text form of input, shortened to at most max
// runes.
//
// Markup is removed with StripTags, every run of whitespace (spaces, tabs,
// newlines, carriage returns, ...) is collapsed into a single space, and
// leading and trailing whitespace is dropped. When the remaining text is
// longer than max runes it is cut at max runes, whitespace left dangling at
// the cut is trimmed, and an ellipsis ("…") is appended. A negative max is
// treated as 0.
func TruncateHTML(input string, max int) string {
	// strings.Fields splits on any run of Unicode whitespace, so joining the
	// fields back together normalizes separators in one pass regardless of
	// how long each run is or which whitespace characters it contains.
	text := strings.Join(strings.Fields(StripTags(input)), " ")

	// A negative limit would make the slice expression below panic.
	if max < 0 {
		max = 0
	}

	// Convert to runes so a multi-byte character is never cut in half.
	runes := []rune(text)
	if len(runes) > max {
		return strings.TrimSpace(string(runes[:max])) + "…"
	}

	return text
}

View File

@ -0,0 +1,65 @@
// Copyright 2022 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package sanitizer
import "testing"
// TestTruncateHTMWithTextLowerThanLimitL checks that text shorter than the
// limit comes back in full, with its markup removed and no ellipsis added.
//
// NOTE(review): the name looks like a typo for
// TestTruncateHTMLWithTextLowerThanLimit; it is left unchanged here.
func TestTruncateHTMWithTextLowerThanLimitL(t *testing.T) {
	const (
		html = `This is a <strong>bug 🐛</strong>.`
		want = `This is a bug 🐛.`
	)

	if got := TruncateHTML(html, 50); want != got {
		t.Errorf(`Wrong output: %q != %q`, want, got)
	}
}
// TestTruncateHTMLWithTextAboveLimit checks that text longer than the limit is
// cut at the limit and suffixed with an ellipsis.
func TestTruncateHTMLWithTextAboveLimit(t *testing.T) {
	const (
		html = `This is <strong>HTML</strong>.`
		want = `This…`
	)

	if got := TruncateHTML(html, 4); want != got {
		t.Errorf(`Wrong output: %q != %q`, want, got)
	}
}
// TestTruncateHTMLWithUnicodeTextAboveLimit checks truncation of an input that
// contains a multi-byte character beyond the cut point.
func TestTruncateHTMLWithUnicodeTextAboveLimit(t *testing.T) {
	const (
		html = `This is a <strong>bike 🚲</strong>.`
		want = `This…`
	)

	if got := TruncateHTML(html, 4); want != got {
		t.Errorf(`Wrong output: %q != %q`, want, got)
	}
}
// TestTruncateHTMLWithMultilineTextAboveLimit checks that line breaks are
// flattened into spaces before truncation and that no space is left dangling
// in front of the ellipsis.
func TestTruncateHTMLWithMultilineTextAboveLimit(t *testing.T) {
	const html = `
This is a <strong>bike
🚲</strong>.
`
	const want = `This is a bike…`

	if got := TruncateHTML(html, 15); want != got {
		t.Errorf(`Wrong output: %q != %q`, want, got)
	}
}
// TestTruncateHTMLWithMultilineTextLowerThanLimit checks that multi-line input
// shorter than the limit is returned on one line, trimmed, with no ellipsis.
func TestTruncateHTMLWithMultilineTextLowerThanLimit(t *testing.T) {
	const html = `
This is a <strong>bike
🚲</strong>.
`
	const want = `This is a bike 🚲.`

	if got := TruncateHTML(html, 20); want != got {
		t.Errorf(`Wrong output: %q != %q`, want, got)
	}
}