From de7a61309878e335fe99c7afbbe6a40cc097b0ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Wed, 18 Nov 2020 17:29:40 -0800 Subject: [PATCH] Calculate reading time during feed processing The goal is to speed up the user interface. Detecting the language based on the content is pretty slow. --- client/core.go | 29 ++++++++++---------- database/migration.go | 2 +- database/sql.go | 2 ++ database/sql/schema_version_41.sql | 1 + model/entry.go | 1 + reader/processor/processor.go | 22 ++++++++++++++++ storage/entry.go | 41 ++++++++++++++++++++++++----- storage/entry_query_builder.go | 2 ++ template/common.go | 6 ++--- template/engine.go | 3 --- template/functions.go | 21 --------------- template/html/common/item_meta.html | 4 +-- 12 files changed, 84 insertions(+), 50 deletions(-) create mode 100644 database/sql/schema_version_41.sql diff --git a/client/core.go b/client/core.go index c2aabf4f..b521729b 100644 --- a/client/core.go +++ b/client/core.go @@ -129,20 +129,21 @@ type Feeds []*Feed // Entry represents a subscription item in the system. 
type Entry struct { - ID int64 `json:"id"` - UserID int64 `json:"user_id"` - FeedID int64 `json:"feed_id"` - Status string `json:"status"` - Hash string `json:"hash"` - Title string `json:"title"` - URL string `json:"url"` - Date time.Time `json:"published_at"` - Content string `json:"content"` - Author string `json:"author"` - ShareCode string `json:"share_code"` - Starred bool `json:"starred"` - Enclosures Enclosures `json:"enclosures,omitempty"` - Feed *Feed `json:"feed,omitempty"` + ID int64 `json:"id"` + UserID int64 `json:"user_id"` + FeedID int64 `json:"feed_id"` + Status string `json:"status"` + Hash string `json:"hash"` + Title string `json:"title"` + URL string `json:"url"` + Date time.Time `json:"published_at"` + Content string `json:"content"` + Author string `json:"author"` + ShareCode string `json:"share_code"` + Starred bool `json:"starred"` + ReadingTime int `json:"reading_time"` + Enclosures Enclosures `json:"enclosures,omitempty"` + Feed *Feed `json:"feed,omitempty"` } // Entries represents a list of entries. diff --git a/database/migration.go b/database/migration.go index a1f415ed..8c291edb 100644 --- a/database/migration.go +++ b/database/migration.go @@ -12,7 +12,7 @@ import ( "miniflux.app/logger" ) -const schemaVersion = 40 +const schemaVersion = 41 // Migrate executes database migrations. 
func Migrate(db *sql.DB) { diff --git a/database/sql.go b/database/sql.go index 0ab746f6..5bf464c8 100644 --- a/database/sql.go +++ b/database/sql.go @@ -203,6 +203,7 @@ alter table users add column entry_direction entry_sorting_direction default 'as add column keeplist_rules text not null default '' ; `, + "schema_version_41": `alter table entries add column reading_time int not null default 0;`, "schema_version_5": `create table integrations ( user_id int not null, pinboard_enabled bool default 'f', @@ -264,6 +265,7 @@ var SqlMapChecksums = map[string]string{ "schema_version_39": "b0f90b97502921d4681a07c64d180a91a0b4ccac7d3c1dbe30519ad6f1bf1737", "schema_version_4": "216ea3a7d3e1704e40c797b5dc47456517c27dbb6ca98bf88812f4f63d74b5d9", "schema_version_40": "6a8fec92399f853ed6817aff4cfa43255dce4c19afad796e41519d09de62105e", + "schema_version_41": "128e118ce61267ea1f6ae03b63a6d4734eae87e520b00e309ad083f1f6afdfe5", "schema_version_5": "46397e2f5f2c82116786127e9f6a403e975b14d2ca7b652a48cd1ba843e6a27c", "schema_version_6": "9d05b4fb223f0e60efc716add5048b0ca9c37511cf2041721e20505d6d798ce4", "schema_version_7": "33f298c9aa30d6de3ca28e1270df51c2884d7596f1283a75716e2aeb634cd05c", diff --git a/database/sql/schema_version_41.sql b/database/sql/schema_version_41.sql new file mode 100644 index 00000000..6b9c43dd --- /dev/null +++ b/database/sql/schema_version_41.sql @@ -0,0 +1 @@ +alter table entries add column reading_time int not null default 0; \ No newline at end of file diff --git a/model/entry.go b/model/entry.go index 54dfe03f..abd303ed 100644 --- a/model/entry.go +++ b/model/entry.go @@ -33,6 +33,7 @@ type Entry struct { Author string `json:"author"` ShareCode string `json:"share_code"` Starred bool `json:"starred"` + ReadingTime int `json:"reading_time"` Enclosures EnclosureList `json:"enclosures,omitempty"` Feed *Feed `json:"feed,omitempty"` } diff --git a/reader/processor/processor.go b/reader/processor/processor.go index c06435a1..4b3dc429 100644 --- 
a/reader/processor/processor.go +++ b/reader/processor/processor.go @@ -5,8 +5,11 @@ package processor import ( + "math" "regexp" + "strings" "time" + "unicode/utf8" "miniflux.app/config" "miniflux.app/logger" @@ -16,6 +19,8 @@ import ( "miniflux.app/reader/sanitizer" "miniflux.app/reader/scraper" "miniflux.app/storage" + + "github.com/rylans/getlang" ) // ProcessFeedEntries downloads original web page for entries and apply filters. @@ -58,6 +63,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) { // The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered. entry.Content = sanitizer.Sanitize(entry.URL, entry.Content) + entry.ReadingTime = calculateReadingTime(entry.Content) filteredEntries = append(filteredEntries, entry) } @@ -108,7 +114,23 @@ func ProcessEntryWebPage(entry *model.Entry) error { if content != "" { entry.Content = content + entry.ReadingTime = calculateReadingTime(content) } return nil } + +func calculateReadingTime(content string) int { + sanitizedContent := sanitizer.StripTags(content) + languageInfo := getlang.FromString(sanitizedContent) + + var timeToReadInt int + if languageInfo.LanguageCode() == "ko" || languageInfo.LanguageCode() == "zh" || languageInfo.LanguageCode() == "ja" { + timeToReadInt = int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / 500)) + } else { + nbOfWords := len(strings.Fields(sanitizedContent)) + timeToReadInt = int(math.Ceil(float64(nbOfWords) / 265)) + } + + return timeToReadInt +} diff --git a/storage/entry.go b/storage/entry.go index e1a85af4..66ce0225 100644 --- a/storage/entry.go +++ b/storage/entry.go @@ -75,11 +75,11 @@ func (s *Storage) UpdateEntryContent(entry *model.Entry) error { UPDATE entries SET - content=$1 + content=$1, reading_time=$2 WHERE - id=$2 AND user_id=$3 + id=$3 AND user_id=$4 ` - _, err = tx.Exec(query, entry.Content, entry.ReadingTime, entry.ID,
entry.UserID) if err != nil { tx.Rollback() return fmt.Errorf(`store: unable to update content of entry #%d: %v`, entry.ID, err) @@ -106,9 +106,35 @@ func (s *Storage) UpdateEntryContent(entry *model.Entry) error { func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error { query := ` INSERT INTO entries - (title, hash, url, comments_url, published_at, content, author, user_id, feed_id, changed_at, document_vectors) + ( + title, + hash, + url, + comments_url, + published_at, + content, + author, + user_id, + feed_id, + reading_time, + changed_at, + document_vectors + ) VALUES - ($1, $2, $3, $4, $5, $6, $7, $8, $9, now(), setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($6, '') for 1000000)), 'B')) + ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + now(), + setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($6, '') for 1000000)), 'B') + ) RETURNING id, status ` @@ -123,6 +149,7 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error { entry.Author, entry.UserID, entry.FeedID, + entry.ReadingTime, ).Scan(&entry.ID, &entry.Status) if err != nil { @@ -154,9 +181,10 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error { comments_url=$3, content=$4, author=$5, + reading_time=$6, document_vectors = setweight(to_tsvector(substring(coalesce($1, '') for 1000000)), 'A') || setweight(to_tsvector(substring(coalesce($4, '') for 1000000)), 'B') WHERE - user_id=$6 AND feed_id=$7 AND hash=$8 + user_id=$7 AND feed_id=$8 AND hash=$9 RETURNING id ` @@ -167,6 +195,7 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error { entry.CommentsURL, entry.Content, entry.Author, + entry.ReadingTime, entry.UserID, entry.FeedID, entry.Hash, diff --git a/storage/entry_query_builder.go b/storage/entry_query_builder.go index d7ab042f..58b7e576 100644 --- a/storage/entry_query_builder.go +++ 
b/storage/entry_query_builder.go @@ -226,6 +226,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) { e.content, e.status, e.starred, + e.reading_time, f.title as feed_title, f.feed_url, f.site_url, @@ -284,6 +285,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) { &entry.Content, &entry.Status, &entry.Starred, + &entry.ReadingTime, &entry.Feed.Title, &entry.Feed.FeedURL, &entry.Feed.SiteURL, diff --git a/template/common.go b/template/common.go index 3139dd36..7b35f46b 100644 --- a/template/common.go +++ b/template/common.go @@ -242,10 +242,10 @@ SOFTWARE.
  • - {{ if .user.ShowReadingTime }} + {{ if and .user.ShowReadingTime (gt .entry.ReadingTime 0) }}
  • - {{ plural "entry.estimated_reading_time" (timeToRead .entry.Content) (timeToRead .entry.Content) }} + {{ plural "entry.estimated_reading_time" .entry.ReadingTime .entry.ReadingTime }}
  • {{ end }} @@ -523,7 +523,7 @@ var templateCommonMapChecksums = map[string]string{ "feed_list": "931e43d328a116318c510de5658c688cd940b934c86b6ec82a472e1f81e020ae", "feed_menu": "318d8662dda5ca9dfc75b909c8461e79c86fb5082df1428f67aaf856f19f4b50", "icons": "9a41753778072f286216085d8712495e2ccca20c7a24f5c982775436a3d38579", - "item_meta": "eb72c6e2a924759af20b8ef41f2ce7495aedc053181c2e5ca1b063f9410c58b0", + "item_meta": "56ab09d7dd46eeb2e2ee11ddcec0c157a5832c896dbd2887d9e2b013680b2af6", "layout": "65767e7dbebe1f7ed42895ecd5a737b0693e4a2ec35e84e3e391f462beb11977", "pagination": "7b61288e86283c4cf0dc83bcbf8bf1c00c7cb29e60201c8c0b633b2450d2911f", "settings_menu": "e2b777630c0efdbc529800303c01d6744ed3af80ec505ac5a5b3f99c9b989156", diff --git a/template/engine.go b/template/engine.go index 8af76e52..c7f75660 100644 --- a/template/engine.go +++ b/template/engine.go @@ -65,9 +65,6 @@ func (e *Engine) Render(name, language string, data interface{}) []byte { "plural": func(key string, n int, args ...interface{}) string { return printer.Plural(key, n, args...) 
}, - "timeToRead": func(content string) int { - return timeToRead(content) - }, }) var b bytes.Buffer diff --git a/template/functions.go b/template/functions.go index fc03e122..7fff2eac 100644 --- a/template/functions.go +++ b/template/functions.go @@ -11,19 +11,16 @@ import ( "net/mail" "strings" "time" - "unicode/utf8" "miniflux.app/config" "miniflux.app/http/route" "miniflux.app/locale" "miniflux.app/model" "miniflux.app/proxy" - "miniflux.app/reader/sanitizer" "miniflux.app/timezone" "miniflux.app/url" "github.com/gorilla/mux" - "github.com/rylans/getlang" ) type funcMap struct { @@ -94,9 +91,6 @@ func (f *funcMap) Map() template.FuncMap { "plural": func(key string, n int, args ...interface{}) string { return "" }, - "timeToRead": func(content string) int { - return 0 - }, } } @@ -195,18 +189,3 @@ func formatFileSize(b int64) string { return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp]) } - -func timeToRead(content string) int { - sanitizedContent := sanitizer.StripTags(content) - languageInfo := getlang.FromString(sanitizedContent) - - var timeToReadInt int - if languageInfo.LanguageCode() == "ko" || languageInfo.LanguageCode() == "zh" || languageInfo.LanguageCode() == "jp" { - timeToReadInt = int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / 500)) - } else { - nbOfWords := len(strings.Fields(sanitizedContent)) - timeToReadInt = int(math.Ceil(float64(nbOfWords) / 265)) - } - - return timeToReadInt -} diff --git a/template/html/common/item_meta.html b/template/html/common/item_meta.html index c31ea955..f92e138e 100644 --- a/template/html/common/item_meta.html +++ b/template/html/common/item_meta.html @@ -7,10 +7,10 @@
  • - {{ if .user.ShowReadingTime }} + {{ if and .user.ShowReadingTime (gt .entry.ReadingTime 0) }}
  • - {{ plural "entry.estimated_reading_time" (timeToRead .entry.Content) (timeToRead .entry.Content) }} + {{ plural "entry.estimated_reading_time" .entry.ReadingTime .entry.ReadingTime }}
  • {{ end }}