Add scraper rules

This commit is contained in:
Frédéric Guillot 2017-12-10 20:51:04 -08:00
parent 7a35c58f53
commit 87ccad5c7f
16 changed files with 140 additions and 34 deletions

View File

@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT. // Code generated by go generate; DO NOT EDIT.
// 2017-12-10 18:56:24.387844114 -0800 PST m=+0.029823201 // 2017-12-10 20:08:14.447304303 -0800 PST m=+0.040286758
package locale package locale
@ -167,12 +167,13 @@ var translations = map[string]string{
"Activate Fever API": "Activer l'API de Fever", "Activate Fever API": "Activer l'API de Fever",
"Fever Username": "Nom d'utilisateur pour l'API de Fever", "Fever Username": "Nom d'utilisateur pour l'API de Fever",
"Fever Password": "Mot de passe pour l'API de Fever", "Fever Password": "Mot de passe pour l'API de Fever",
"Fetch original content": "Récupérer le contenu original" "Fetch original content": "Récupérer le contenu original",
"Scraper Rules": "Règles pour récupérer le contenu original"
} }
`, `,
} }
var translationsChecksums = map[string]string{ var translationsChecksums = map[string]string{
"en_US": "6fe95384260941e8a5a3c695a655a932e0a8a6a572c1e45cb2b1ae8baa01b897", "en_US": "6fe95384260941e8a5a3c695a655a932e0a8a6a572c1e45cb2b1ae8baa01b897",
"fr_FR": "fd629b171aefa50dd0a6100acaac8fbecbdf1a1d53e3fce984234565ec5bb5d5", "fr_FR": "4426cea875ee2c9acb1a2b0619cb82f3a32f71aabe5d07657eaf2f6b7387c5f9",
} }

View File

@ -151,5 +151,6 @@
"Activate Fever API": "Activer l'API de Fever", "Activate Fever API": "Activer l'API de Fever",
"Fever Username": "Nom d'utilisateur pour l'API de Fever", "Fever Username": "Nom d'utilisateur pour l'API de Fever",
"Fever Password": "Mot de passe pour l'API de Fever", "Fever Password": "Mot de passe pour l'API de Fever",
"Fetch original content": "Récupérer le contenu original" "Fetch original content": "Récupérer le contenu original",
"Scraper Rules": "Règles pour récupérer le contenu original"
} }

View File

@ -22,6 +22,7 @@ type Feed struct {
LastModifiedHeader string `json:"last_modified_header,omitempty"` LastModifiedHeader string `json:"last_modified_header,omitempty"`
ParsingErrorMsg string `json:"parsing_error_message,omitempty"` ParsingErrorMsg string `json:"parsing_error_message,omitempty"`
ParsingErrorCount int `json:"parsing_error_count,omitempty"` ParsingErrorCount int `json:"parsing_error_count,omitempty"`
ScraperRules string `json:"scraper_rules"`
Category *Category `json:"category,omitempty"` Category *Category `json:"category,omitempty"`
Entries Entries `json:"entries,omitempty"` Entries Entries `json:"entries,omitempty"`
Icon *FeedIcon `json:"icon,omitempty"` Icon *FeedIcon `json:"icon,omitempty"`

16
reader/scraper/rules.go Normal file
View File

@ -0,0 +1,16 @@
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package scraper
// predefinedRules maps a website domain to the CSS selectors used to
// extract its article content when a feed has no custom scraper rules
// (see Fetch, which falls back to this table before readability).
// Entries are alphabetically sorted by domain.
var predefinedRules = map[string]string{
	"lemonde.fr":        "div#articleBody",
	"lesjoiesducode.fr": ".blog-post-content img",
	"linux.com":         "div.content, div[property]",
	"opensource.com":    "div[property]",
	"phoronix.com":      "div.content",
	"techcrunch.com":    "div.article-entry",
}

View File

@ -6,14 +6,19 @@ package scraper
import ( import (
"errors" "errors"
"io"
"log"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/miniflux/miniflux2/http" "github.com/miniflux/miniflux2/http"
"github.com/miniflux/miniflux2/reader/readability" "github.com/miniflux/miniflux2/reader/readability"
"github.com/miniflux/miniflux2/reader/sanitizer" "github.com/miniflux/miniflux2/reader/sanitizer"
"github.com/miniflux/miniflux2/url"
) )
// Fetch downloads a web page and returns relevant contents. // Fetch downloads a web page and returns relevant contents.
func Fetch(websiteURL string) (string, error) { func Fetch(websiteURL, rules string) (string, error) {
client := http.NewClient(websiteURL) client := http.NewClient(websiteURL)
response, err := client.Get() response, err := client.Get()
if err != nil { if err != nil {
@ -29,10 +34,57 @@ func Fetch(websiteURL string) (string, error) {
return "", err return "", err
} }
content, err := readability.ExtractContent(page) var content string
if rules == "" {
rules = getPredefinedScraperRules(websiteURL)
}
if rules != "" {
log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
content, err = scrapContent(page, rules)
} else {
log.Printf(`[Scraper] Using readability for "%s"`, websiteURL)
content, err = readability.ExtractContent(page)
}
if err != nil { if err != nil {
return "", err return "", err
} }
return sanitizer.Sanitize(websiteURL, content), nil return sanitizer.Sanitize(websiteURL, content), nil
} }
// scrapContent applies the given CSS selector rules to the page and
// returns the concatenated inner HTML of every matching element.
func scrapContent(page io.Reader, rules string) (string, error) {
	doc, err := goquery.NewDocumentFromReader(page)
	if err != nil {
		return "", err
	}

	var buf strings.Builder
	doc.Find(rules).Each(func(_ int, selection *goquery.Selection) {
		fragment := ""
		// Inline images are extracted together with their parent element
		// so they keep their surrounding markup.
		if selection.Is("img") {
			fragment, _ = selection.Parent().Html()
		} else {
			fragment, _ = selection.Html()
		}
		buf.WriteString(fragment)
	})

	return buf.String(), nil
}
// getPredefinedScraperRules returns the built-in CSS selector rules
// whose domain matches the domain of websiteURL, or an empty string
// when no predefined rule applies.
func getPredefinedScraperRules(websiteURL string) string {
	host := url.Domain(websiteURL)
	for candidate, selectors := range predefinedRules {
		// NOTE(review): substring match, so "www.linux.com" matches the
		// "linux.com" entry; it would also match unrelated hosts that
		// merely contain the domain.
		if strings.Contains(host, candidate) {
			return selectors
		}
	}
	return ""
}

View File

@ -0,0 +1,21 @@
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package scraper
import "testing"
// TestGetPredefinedRules verifies that predefined scraper rules are
// resolved from a website URL's domain, and that unknown domains yield
// no rule. Rewritten as a table-driven test so new domains can be
// covered by adding a row.
func TestGetPredefinedRules(t *testing.T) {
	testCases := []struct {
		websiteURL string
		hasRule    bool
	}{
		{"http://www.phoronix.com/", true},
		{"https://www.linux.com/", true},
		{"https://example.org/", false},
	}

	for _, tc := range testCases {
		rules := getPredefinedScraperRules(tc.websiteURL)
		if tc.hasRule && rules == "" {
			t.Errorf("Unable to find rule for %q", tc.websiteURL)
		}
		if !tc.hasRule && rules != "" {
			t.Errorf("A rule not defined should not return anything, got %q for %q", rules, tc.websiteURL)
		}
	}
}

View File

@ -45,6 +45,9 @@
<label for="form-feed-url">{{ t "Feed URL" }}</label> <label for="form-feed-url">{{ t "Feed URL" }}</label>
<input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required> <input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
<label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
<input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
<label for="form-category">{{ t "Category" }}</label> <label for="form-category">{{ t "Category" }}</label>
<select id="form-category" name="category_id"> <select id="form-category" name="category_id">
{{ range .categories }} {{ range .categories }}

View File

@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT. // Code generated by go generate; DO NOT EDIT.
// 2017-12-10 18:56:24.375327888 -0800 PST m=+0.017306975 // 2017-12-10 20:08:14.428877093 -0800 PST m=+0.021859548
package template package template
@ -395,6 +395,9 @@ var templateViewsMap = map[string]string{
<label for="form-feed-url">{{ t "Feed URL" }}</label> <label for="form-feed-url">{{ t "Feed URL" }}</label>
<input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required> <input type="url" name="feed_url" id="form-feed-url" placeholder="https://domain.tld/" value="{{ .form.FeedURL }}" required>
<label for="form-scraper-rules">{{ t "Scraper Rules" }}</label>
<input type="text" name="scraper_rules" id="form-scraper-rules" value="{{ .form.ScraperRules }}">
<label for="form-category">{{ t "Category" }}</label> <label for="form-category">{{ t "Category" }}</label>
<select id="form-category" name="category_id"> <select id="form-category" name="category_id">
{{ range .categories }} {{ range .categories }}
@ -1181,7 +1184,7 @@ var templateViewsMapChecksums = map[string]string{
"create_category": "2b82af5d2dcd67898dc5daa57a6461e6ff8121a6089b2a2a1be909f35e4a2275", "create_category": "2b82af5d2dcd67898dc5daa57a6461e6ff8121a6089b2a2a1be909f35e4a2275",
"create_user": "45e226df757126d5fe7c464e295e9a34f07952cfdb71e31e49839850d35af139", "create_user": "45e226df757126d5fe7c464e295e9a34f07952cfdb71e31e49839850d35af139",
"edit_category": "cee720faadcec58289b707ad30af623d2ee66c1ce23a732965463250d7ff41c5", "edit_category": "cee720faadcec58289b707ad30af623d2ee66c1ce23a732965463250d7ff41c5",
"edit_feed": "c5bc4c22bf7e8348d880395250545595d21fb8c8e723fc5d7cca68e25d250884", "edit_feed": "b3c7dd5e93d58e051abcd59da31217d8e9b50587014b895d1b7c9172247b35f8",
"edit_user": "82d9749d76ddbd2352816d813c4b1f6d92f2222de678b4afe5821090246735c7", "edit_user": "82d9749d76ddbd2352816d813c4b1f6d92f2222de678b4afe5821090246735c7",
"entry": "ebcf9bb35812dd02759718f7f7411267e6a6c8efd59a9aa0a0e735bcb88efeff", "entry": "ebcf9bb35812dd02759718f7f7411267e6a6c8efd59a9aa0a0e735bcb88efeff",
"feed_entries": "547c19eb36b20e350ce70ed045173b064cdcd6b114afb241c9f2dda9d88fcc27", "feed_entries": "547c19eb36b20e350ce70ed045173b064cdcd6b114afb241c9f2dda9d88fcc27",

View File

@ -40,18 +40,14 @@ func (c *Controller) FetchContent(ctx *core.Context, request *core.Request, resp
return return
} }
content, err := scraper.Fetch(entry.URL) content, err := scraper.Fetch(entry.URL, entry.Feed.ScraperRules)
if err != nil { if err != nil {
response.JSON().ServerError(err) response.JSON().ServerError(err)
return return
} }
if len(content) > len(entry.Content) { entry.Content = content
entry.Content = content c.store.UpdateEntryContent(entry)
c.store.UpdateEntryContent(entry)
} else {
content = entry.Content
}
response.JSON().Created(map[string]string{"content": content}) response.JSON().Created(map[string]string{"content": content})
} }

View File

@ -217,10 +217,11 @@ func (c *Controller) getFeedFormTemplateArgs(ctx *core.Context, user *model.User
if feedForm == nil { if feedForm == nil {
args["form"] = form.FeedForm{ args["form"] = form.FeedForm{
SiteURL: feed.SiteURL, SiteURL: feed.SiteURL,
FeedURL: feed.FeedURL, FeedURL: feed.FeedURL,
Title: feed.Title, Title: feed.Title,
CategoryID: feed.Category.ID, ScraperRules: feed.ScraperRules,
CategoryID: feed.Category.ID,
} }
} else { } else {
args["form"] = feedForm args["form"] = feedForm

View File

@ -14,10 +14,11 @@ import (
// FeedForm represents a feed form in the UI // FeedForm represents a feed form in the UI
type FeedForm struct { type FeedForm struct {
FeedURL string FeedURL string
SiteURL string SiteURL string
Title string Title string
CategoryID int64 ScraperRules string
CategoryID int64
} }
// ValidateModification validates FeedForm fields // ValidateModification validates FeedForm fields
@ -34,6 +35,7 @@ func (f FeedForm) Merge(feed *model.Feed) *model.Feed {
feed.Title = f.Title feed.Title = f.Title
feed.SiteURL = f.SiteURL feed.SiteURL = f.SiteURL
feed.FeedURL = f.FeedURL feed.FeedURL = f.FeedURL
feed.ScraperRules = f.ScraperRules
feed.ParsingErrorCount = 0 feed.ParsingErrorCount = 0
feed.ParsingErrorMsg = "" feed.ParsingErrorMsg = ""
return feed return feed
@ -47,9 +49,10 @@ func NewFeedForm(r *http.Request) *FeedForm {
} }
return &FeedForm{ return &FeedForm{
FeedURL: r.FormValue("feed_url"), FeedURL: r.FormValue("feed_url"),
SiteURL: r.FormValue("site_url"), SiteURL: r.FormValue("site_url"),
Title: r.FormValue("title"), Title: r.FormValue("title"),
CategoryID: int64(categoryID), ScraperRules: r.FormValue("scraper_rules"),
CategoryID: int64(categoryID),
} }
} }

1
sql/schema_version_6.sql Normal file
View File

@ -0,0 +1 @@
alter table feeds add column scraper_rules text default '';

View File

@ -1,5 +1,5 @@
// Code generated by go generate; DO NOT EDIT. // Code generated by go generate; DO NOT EDIT.
// 2017-12-10 18:56:24.36359961 -0800 PST m=+0.005578697 // 2017-12-10 20:08:14.411225368 -0800 PST m=+0.004207823
package sql package sql
@ -136,6 +136,8 @@ alter table users add column entry_direction entry_sorting_direction default 'as
fever_token text default '', fever_token text default '',
primary key(user_id) primary key(user_id)
) )
`,
"schema_version_6": `alter table feeds add column scraper_rules text default '';
`, `,
} }
@ -145,4 +147,5 @@ var SqlMapChecksums = map[string]string{
"schema_version_3": "a54745dbc1c51c000f74d4e5068f1e2f43e83309f023415b1749a47d5c1e0f12", "schema_version_3": "a54745dbc1c51c000f74d4e5068f1e2f43e83309f023415b1749a47d5c1e0f12",
"schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9", "schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9",
"schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c", "schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c",
"schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4",
} }

View File

@ -152,7 +152,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
SELECT SELECT
e.id, e.user_id, e.feed_id, e.hash, e.published_at at time zone '%s', e.title, e.url, e.author, e.content, e.status, e.id, e.user_id, e.feed_id, e.hash, e.published_at at time zone '%s', e.title, e.url, e.author, e.content, e.status,
f.title as feed_title, f.feed_url, f.site_url, f.checked_at, f.title as feed_title, f.feed_url, f.site_url, f.checked_at,
f.category_id, c.title as category_title, f.category_id, c.title as category_title, f.scraper_rules,
fi.icon_id fi.icon_id
FROM entries e FROM entries e
LEFT JOIN feeds f ON f.id=e.feed_id LEFT JOIN feeds f ON f.id=e.feed_id
@ -197,6 +197,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
&entry.Feed.CheckedAt, &entry.Feed.CheckedAt,
&entry.Feed.Category.ID, &entry.Feed.Category.ID,
&entry.Feed.Category.Title, &entry.Feed.Category.Title,
&entry.Feed.ScraperRules,
&iconID, &iconID,
) )

View File

@ -52,7 +52,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
feeds := make(model.Feeds, 0) feeds := make(model.Feeds, 0)
query := `SELECT query := `SELECT
f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header, f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
f.category_id, c.title as category_title, f.category_id, c.title as category_title,
fi.icon_id fi.icon_id
FROM feeds f FROM feeds f
@ -84,6 +84,7 @@ func (s *Storage) Feeds(userID int64) (model.Feeds, error) {
&feed.CheckedAt, &feed.CheckedAt,
&feed.ParsingErrorCount, &feed.ParsingErrorCount,
&errorMsg, &errorMsg,
&feed.ScraperRules,
&feed.Category.ID, &feed.Category.ID,
&feed.Category.Title, &feed.Category.Title,
&iconID, &iconID,
@ -122,7 +123,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
query := ` query := `
SELECT SELECT
f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header, f.id, f.feed_url, f.site_url, f.title, f.etag_header, f.last_modified_header,
f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.user_id, f.checked_at, f.parsing_error_count, f.parsing_error_msg, f.scraper_rules,
f.category_id, c.title as category_title f.category_id, c.title as category_title
FROM feeds f FROM feeds f
LEFT JOIN categories c ON c.id=f.category_id LEFT JOIN categories c ON c.id=f.category_id
@ -139,6 +140,7 @@ func (s *Storage) FeedByID(userID, feedID int64) (*model.Feed, error) {
&feed.CheckedAt, &feed.CheckedAt,
&feed.ParsingErrorCount, &feed.ParsingErrorCount,
&feed.ParsingErrorMsg, &feed.ParsingErrorMsg,
&feed.ScraperRules,
&feed.Category.ID, &feed.Category.ID,
&feed.Category.Title, &feed.Category.Title,
) )
@ -195,8 +197,8 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
query := `UPDATE feeds SET query := `UPDATE feeds SET
feed_url=$1, site_url=$2, title=$3, category_id=$4, etag_header=$5, last_modified_header=$6, checked_at=$7, feed_url=$1, site_url=$2, title=$3, category_id=$4, etag_header=$5, last_modified_header=$6, checked_at=$7,
parsing_error_msg=$8, parsing_error_count=$9 parsing_error_msg=$8, parsing_error_count=$9, scraper_rules=$10
WHERE id=$10 AND user_id=$11` WHERE id=$11 AND user_id=$12`
_, err = s.db.Exec(query, _, err = s.db.Exec(query,
feed.FeedURL, feed.FeedURL,
@ -208,6 +210,7 @@ func (s *Storage) UpdateFeed(feed *model.Feed) (err error) {
feed.CheckedAt, feed.CheckedAt,
feed.ParsingErrorMsg, feed.ParsingErrorMsg,
feed.ParsingErrorCount, feed.ParsingErrorCount,
feed.ScraperRules,
feed.ID, feed.ID,
feed.UserID, feed.UserID,
) )

View File

@ -12,7 +12,7 @@ import (
"github.com/miniflux/miniflux2/sql" "github.com/miniflux/miniflux2/sql"
) )
const schemaVersion = 5 const schemaVersion = 6
// Migrate run database migrations. // Migrate run database migrations.
func (s *Storage) Migrate() { func (s *Storage) Migrate() {