From d0f99cee1af46be932b3b6b7e343a511eaa829a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Fri, 1 Dec 2023 16:27:18 -0800 Subject: [PATCH] Regression: ensure all HTML documents are encoded in UTF-8 Fixes #2196 --- internal/reader/atom/parser.go | 4 ++-- internal/reader/encoding/encoding.go | 9 +++++++-- internal/reader/icon/finder.go | 15 ++++++++++++--- internal/reader/icon/finder_test.go | 2 +- internal/reader/parser/format.go | 2 +- internal/reader/rdf/parser.go | 2 +- internal/reader/rss/parser.go | 2 +- internal/reader/scraper/scraper.go | 13 +++++++++++-- internal/reader/subscription/finder.go | 17 +++++++++++++---- internal/reader/subscription/finder_test.go | 20 ++++++++++---------- internal/reader/xml/decoder.go | 6 +++--- internal/reader/xml/decoder_test.go | 12 ++++++------ 12 files changed, 68 insertions(+), 36 deletions(-) diff --git a/internal/reader/atom/parser.go b/internal/reader/atom/parser.go index aab3e6a4..03aa9158 100644 --- a/internal/reader/atom/parser.go +++ b/internal/reader/atom/parser.go @@ -29,7 +29,7 @@ func Parse(baseURL string, r io.Reader) (*model.Feed, error) { rawFeed = new(atom10Feed) } - if err := xml_decoder.NewDecoder(&buf).Decode(rawFeed); err != nil { + if err := xml_decoder.NewXMLDecoder(&buf).Decode(rawFeed); err != nil { return nil, fmt.Errorf("atom: unable to parse feed: %w", err) } @@ -37,7 +37,7 @@ func Parse(baseURL string, r io.Reader) (*model.Feed, error) { } func getAtomFeedVersion(data io.Reader) string { - decoder := xml_decoder.NewDecoder(data) + decoder := xml_decoder.NewXMLDecoder(data) for { token, _ := decoder.Token() if token == nil { diff --git a/internal/reader/encoding/encoding.go b/internal/reader/encoding/encoding.go index a580a050..71f93543 100644 --- a/internal/reader/encoding/encoding.go +++ b/internal/reader/encoding/encoding.go @@ -22,7 +22,7 @@ import ( // - Feeds with encoding specified in both places // - Feeds with encoding specified only in XML document and not in HTTP header // - Feeds with wrong encoding defined and already in UTF-8 -func CharsetReader(label string, input io.Reader) (io.Reader, error) { +func CharsetReader(charsetLabel string, input io.Reader) (io.Reader, error) { buffer, _ := io.ReadAll(input) r := bytes.NewReader(buffer) @@ -33,5 +33,10 @@ func CharsetReader(label string, input io.Reader) (io.Reader, error) { } // Transform document to UTF-8 from the specified encoding in XML prolog. - return charset.NewReaderLabel(label, r) + return charset.NewReaderLabel(charsetLabel, r) +} + +// CharsetReaderFromContentType is used when the encoding is not specified for the input document. 
+func CharsetReaderFromContentType(contentType string, input io.Reader) (io.Reader, error) { + return charset.NewReader(input, contentType) } diff --git a/internal/reader/icon/finder.go b/internal/reader/icon/finder.go index 91a74d14..cb9c7d49 100644 --- a/internal/reader/icon/finder.go +++ b/internal/reader/icon/finder.go @@ -14,6 +14,7 @@ import ( "miniflux.app/v2/internal/config" "miniflux.app/v2/internal/crypto" "miniflux.app/v2/internal/model" + "miniflux.app/v2/internal/reader/encoding" "miniflux.app/v2/internal/reader/fetcher" "miniflux.app/v2/internal/urllib" @@ -110,7 +111,10 @@ func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) { return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error()) } - iconURLs, err := findIconURLsFromHTMLDocument(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) + iconURLs, err := findIconURLsFromHTMLDocument( + responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), + responseHandler.ContentType(), + ) if err != nil { return nil, err } @@ -178,7 +182,7 @@ func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) { return icon, nil } -func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) { +func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, error) { queries := []string{ "link[rel='shortcut icon']", "link[rel='Shortcut Icon']", @@ -186,7 +190,12 @@ func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) { "link[rel='icon']", } - doc, err := goquery.NewDocumentFromReader(body) + htmlDocumentReader, err := encoding.CharsetReaderFromContentType(contentType, body) + if err != nil { + return nil, fmt.Errorf("icon: unable to create charset reader: %w", err) + } + + doc, err := goquery.NewDocumentFromReader(htmlDocumentReader) if err != nil { return nil, fmt.Errorf("icon: unable to read document: %v", err) } diff --git a/internal/reader/icon/finder_test.go b/internal/reader/icon/finder_test.go index a6f30738..9bb71126 100644 --- a/internal/reader/icon/finder_test.go +++ b/internal/reader/icon/finder_test.go @@ -112,7 +112,7 @@ func TestParseDocumentWithWhitespaceIconURL(t *testing.T) { /static/img/favicon.ico ">` - iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html)) + iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html), "text/html") if err != nil { t.Fatal(err) } diff --git a/internal/reader/parser/format.go b/internal/reader/parser/format.go index 1019d164..b0a7c2e3 100644 --- a/internal/reader/parser/format.go +++ b/internal/reader/parser/format.go @@ -30,7 +30,7 @@ func DetectFeedFormat(r io.ReadSeeker) string { } r.Seek(0, io.SeekStart) - decoder := rxml.NewDecoder(r) + decoder := rxml.NewXMLDecoder(r) for { token, _ := decoder.Token() diff --git a/internal/reader/rdf/parser.go b/internal/reader/rdf/parser.go index 1ce8b16b..feb069b5 100644 --- a/internal/reader/rdf/parser.go +++ b/internal/reader/rdf/parser.go @@ -14,7 +14,7 @@ import ( // Parse returns a normalized feed struct from a RDF feed. 
func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
 feed := new(rdfFeed)
- if err := xml.NewDecoder(data).Decode(feed); err != nil {
+ if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
 return nil, fmt.Errorf("rdf: unable to parse feed: %w", err)
 }

diff --git a/internal/reader/rss/parser.go b/internal/reader/rss/parser.go
index 90a48750..3828e800 100644
--- a/internal/reader/rss/parser.go
+++ b/internal/reader/rss/parser.go
@@ -14,7 +14,7 @@ import (
 // Parse returns a normalized feed struct from a RSS feed.
 func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
 feed := new(rssFeed)
- if err := xml.NewDecoder(data).Decode(feed); err != nil {
+ if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
 return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
 }
 return feed.Transform(baseURL), nil
diff --git a/internal/reader/scraper/scraper.go b/internal/reader/scraper/scraper.go
index 8508761f..4aabff48 100644
--- a/internal/reader/scraper/scraper.go
+++ b/internal/reader/scraper/scraper.go
@@ -10,6 +10,7 @@ import (
 "strings"

 "miniflux.app/v2/internal/config"
+ "miniflux.app/v2/internal/reader/encoding"
 "miniflux.app/v2/internal/reader/fetcher"
 "miniflux.app/v2/internal/reader/readability"
 "miniflux.app/v2/internal/urllib"
@@ -41,17 +42,25 @@ func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules str
 var content string
 var err error

+ htmlDocumentReader, err := encoding.CharsetReaderFromContentType(
+ responseHandler.ContentType(),
+ responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
+ )
+ if err != nil {
+ return "", fmt.Errorf("scraper: unable to read HTML document: %w", err)
+ }
+
 if sameSite && rules != "" {
 slog.Debug("Extracting content with custom rules",
 "url", websiteURL,
 "rules", rules,
 )
- content, err = findContentUsingCustomRules(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), rules)
+ content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 } else {
 slog.Debug("Extracting content with readability",
 "url", websiteURL,
 )
- content, err = readability.ExtractContent(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
+ content, err = readability.ExtractContent(htmlDocumentReader)
 }

 if err != nil {
diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go
index 418e05e7..89b5f196 100644
--- a/internal/reader/subscription/finder.go
+++ b/internal/reader/subscription/finder.go
@@ -14,6 +14,7 @@ import (
 "miniflux.app/v2/internal/integration/rssbridge"
 "miniflux.app/v2/internal/locale"
 "miniflux.app/v2/internal/model"
+ "miniflux.app/v2/internal/reader/encoding"
 "miniflux.app/v2/internal/reader/fetcher"
 "miniflux.app/v2/internal/reader/parser"
 "miniflux.app/v2/internal/urllib"
@@ -98,8 +99,11 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
 }

 // Step 4) Parse web page to find feeds from HTML meta tags.
- slog.Debug("Try to detect feeds from HTML meta tags", slog.String("website_url", websiteURL))
- subscriptions, localizedError = f.FindSubscriptionsFromWebPage(websiteURL, bytes.NewReader(responseBody))
+ slog.Debug("Try to detect feeds from HTML meta tags",
+ slog.String("website_url", websiteURL),
+ slog.String("content_type", responseHandler.ContentType()),
+ )
+ subscriptions, localizedError = f.FindSubscriptionsFromWebPage(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody))
 if localizedError != nil {
 return nil, localizedError
 }
@@ -138,7 +142,7 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
 return nil, nil
 }

-func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
+func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentType string, body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
 queries := map[string]string{
 "link[type='application/rss+xml']":   parser.FormatRSS,
 "link[type='application/atom+xml']":  parser.FormatAtom,
@@ -146,7 +150,12 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, bod
 "link[type='application/feed+json']": parser.FormatJSON,
 }

- doc, err := goquery.NewDocumentFromReader(body)
+ htmlDocumentReader, err := encoding.CharsetReaderFromContentType(contentType, body)
+ if err != nil {
+ return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
+ }
+
+ doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
 if err != nil {
 return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
 }
diff --git a/internal/reader/subscription/finder_test.go b/internal/reader/subscription/finder_test.go
index 71f49281..160b52cb 100644
--- a/internal/reader/subscription/finder_test.go
+++ b/internal/reader/subscription/finder_test.go
@@ -40,7 +40,7 @@ func TestParseWebPageWithRssFeed(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -73,7 +73,7 @@ func TestParseWebPageWithAtomFeed(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -106,7 +106,7 @@ func TestParseWebPageWithJSONFeed(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -139,7 +139,7 @@ func TestParseWebPageWithOldJSONFeedMimeType(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -172,7 +172,7 @@ func TestParseWebPageWithRelativeFeedURL(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -205,7 +205,7 @@ func TestParseWebPageWithEmptyTitle(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -239,7 +239,7 @@ func TestParseWebPageWithMultipleFeeds(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -261,7 +261,7 @@ func TestParseWebPageWithDuplicatedFeeds(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -294,7 +294,7 @@ func TestParseWebPageWithEmptyFeedURL(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
@@ -315,7 +315,7 @@ func TestParseWebPageWithNoHref(t *testing.T) {
 `

- subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+ subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", "text/html", strings.NewReader(htmlPage))
 if err != nil {
 t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 }
diff --git a/internal/reader/xml/decoder.go b/internal/reader/xml/decoder.go
index b8af6330..3b46cf78 100644
--- a/internal/reader/xml/decoder.go
+++ b/internal/reader/xml/decoder.go
@@ -13,8 +13,8 @@ import (
 "miniflux.app/v2/internal/reader/encoding"
 )

-// NewDecoder returns a XML decoder that filters illegal characters.
-func NewDecoder(data io.Reader) *xml.Decoder {
+// NewXMLDecoder returns an XML decoder that filters illegal characters.
+func NewXMLDecoder(data io.Reader) *xml.Decoder {
 var decoder *xml.Decoder
 buffer, _ := io.ReadAll(data)
 enc := procInst("encoding", string(buffer))
@@ -36,7 +36,7 @@
 }
 rawData, err := io.ReadAll(utf8Reader)
 if err != nil {
- return nil, fmt.Errorf("Unable to read data: %q", err)
+ return nil, fmt.Errorf("encoding: unable to read data: %w", err)
 }
 filteredBytes := bytes.Map(filterValidXMLChar, rawData)
 return bytes.NewReader(filteredBytes), nil
diff --git a/internal/reader/xml/decoder_test.go b/internal/reader/xml/decoder_test.go
index b8d59b5c..35bf9e64 100644
--- a/internal/reader/xml/decoder_test.go
+++ b/internal/reader/xml/decoder_test.go
@@ -10,7 +10,7 @@ import (
 "testing"
 )

-func TestUTF8WithIllegalCharacters(t *testing.T) {
+func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
 type myxml struct {
 XMLName xml.Name `xml:"rss"`
 Version string `xml:"version,attr"`
@@ -23,7 +23,7 @@ func TestUTF8WithIllegalCharacters(t *testing.T) {

 var x myxml

- decoder := NewDecoder(reader)
+ decoder := NewXMLDecoder(reader)
 err := decoder.Decode(&x)
 if err != nil {
 t.Error(err)
@@ -34,7 +34,7 @@ func TestUTF8WithIllegalCharacters(t *testing.T) {
 }
 }

-func TestWindows251WithIllegalCharacters(t *testing.T) {
+func TestXMLDocumentWindows1251EncodedWithIllegalCharacters(t *testing.T) {
 type myxml struct {
 XMLName xml.Name `xml:"rss"`
 Version string `xml:"version,attr"`
@@ -47,7 +47,7 @@ func TestWindows251WithIllegalCharacters(t *testing.T) {

 var x myxml

- decoder := NewDecoder(reader)
+ decoder := NewXMLDecoder(reader)
 err := decoder.Decode(&x)
 if err != nil {
 t.Error(err)
@@ -58,7 +58,7 @@ func TestWindows251WithIllegalCharacters(t *testing.T) {
 }
 }

-func TestIllegalEncodingField(t *testing.T) {
+func TestXMLDocumentWithIncorrectEncodingField(t *testing.T) {
 type myxml struct {
 XMLName xml.Name `xml:"rss"`
 Version string `xml:"version,attr"`
@@ -71,7 +71,7 @@ func TestIllegalEncodingField(t *testing.T) {

 var x myxml

- decoder := NewDecoder(reader)
+ decoder := NewXMLDecoder(reader)
 err := decoder.Decode(&x)
 if err != nil {
 t.Error(err)
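
-- 
Note on the fix: CharsetReaderFromContentType delegates to charset.NewReader
from golang.org/x/net/html/charset (already a dependency of this package),
which takes the encoding from the Content-Type header and otherwise sniffs the
first 1024 bytes of the document. A minimal standalone sketch of the conversion
now applied to HTML documents; the ISO-8859-1 input string is a made-up
example, not taken from this patch:

	package main

	import (
		"fmt"
		"io"
		"strings"

		"golang.org/x/net/html/charset"
	)

	func main() {
		// "café" encoded in ISO-8859-1: the 0xE9 byte is not valid UTF-8.
		latin1 := "caf\xe9"

		// Same call as CharsetReaderFromContentType: the charset parameter
		// of the Content-Type header drives the transformation to UTF-8.
		r, err := charset.NewReader(strings.NewReader(latin1), "text/html; charset=iso-8859-1")
		if err != nil {
			panic(err)
		}

		utf8Bytes, _ := io.ReadAll(r)
		fmt.Println(string(utf8Bytes)) // prints "café" as valid UTF-8
	}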
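The bytes.Map(filterValidXMLChar, rawData) call above is what strips illegal
characters once the data is UTF-8. The filter itself is outside this hunk; a
sketch consistent with the XML 1.0 Char production, reconstructed here for
reference rather than copied from the repository:

	// filterValidXMLChar returns -1 for characters outside the XML 1.0
	// Char range; bytes.Map drops every rune mapped to a negative value.
	func filterValidXMLChar(r rune) rune {
		if r == 0x09 || r == 0x0A || r == 0x0D ||
			(r >= 0x20 && r <= 0xD7FF) ||
			(r >= 0xE000 && r <= 0xFFFD) ||
			(r >= 0x10000 && r <= 0x10FFFF) {
			return r
		}
		return -1
	}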