From 806a06978536d976f3504ec0eed0c0e658a44df4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Mon, 4 Jul 2022 12:48:48 -0700 Subject: [PATCH] sanitizer: handle image URLs in srcset attribute with comma --- proxy/image_proxy.go | 48 +++++------------ proxy/image_proxy_test.go | 27 +++++++++- reader/sanitizer/sanitizer.go | 48 +++-------------- reader/sanitizer/sanitizer_test.go | 10 ---- reader/sanitizer/srcset.go | 82 ++++++++++++++++++++++++++++ reader/sanitizer/srcset_test.go | 85 ++++++++++++++++++++++++++++++ 6 files changed, 212 insertions(+), 88 deletions(-) create mode 100644 reader/sanitizer/srcset.go create mode 100644 reader/sanitizer/srcset_test.go diff --git a/proxy/image_proxy.go b/proxy/image_proxy.go index 07b087b9..e486654e 100644 --- a/proxy/image_proxy.go +++ b/proxy/image_proxy.go @@ -5,18 +5,16 @@ package proxy // import "miniflux.app/proxy" import ( - "regexp" "strings" "miniflux.app/config" + "miniflux.app/reader/sanitizer" "miniflux.app/url" "github.com/PuerkitoBio/goquery" "github.com/gorilla/mux" ) -var regexSplitSrcset = regexp.MustCompile(`,\s+`) - // ImageProxyRewriter replaces image URLs with internal proxy URLs. func ImageProxyRewriter(router *mux.Router, data string) string { proxyImages := config.Opts.ProxyImages() @@ -30,24 +28,20 @@ func ImageProxyRewriter(router *mux.Router, data string) string { } doc.Find("img").Each(func(i int, img *goquery.Selection) { - if srcAttr, ok := img.Attr("src"); ok { - if !isDataURL(srcAttr) && (proxyImages == "all" || !url.IsHTTPS(srcAttr)) { - img.SetAttr("src", ProxifyURL(router, srcAttr)) + if srcAttrValue, ok := img.Attr("src"); ok { + if !isDataURL(srcAttrValue) && (proxyImages == "all" || !url.IsHTTPS(srcAttrValue)) { + img.SetAttr("src", ProxifyURL(router, srcAttrValue)) } } - if srcsetAttr, ok := img.Attr("srcset"); ok { - if proxyImages == "all" || !url.IsHTTPS(srcsetAttr) { - proxifySourceSet(img, router, srcsetAttr) - } + if srcsetAttrValue, ok := img.Attr("srcset"); ok { + proxifySourceSet(img, router, proxyImages, srcsetAttrValue) } }) doc.Find("picture source").Each(func(i int, sourceElement *goquery.Selection) { - if srcsetAttr, ok := sourceElement.Attr("srcset"); ok { - if proxyImages == "all" || !url.IsHTTPS(srcsetAttr) { - proxifySourceSet(sourceElement, router, srcsetAttr) - } + if srcsetAttrValue, ok := sourceElement.Attr("srcset"); ok { + proxifySourceSet(sourceElement, router, proxyImages, srcsetAttrValue) } }) @@ -59,30 +53,16 @@ func ImageProxyRewriter(router *mux.Router, data string) string { return output } -func proxifySourceSet(element *goquery.Selection, router *mux.Router, attributeValue string) { - var proxifiedSources []string +func proxifySourceSet(element *goquery.Selection, router *mux.Router, proxyImages, srcsetAttrValue string) { + imageCandidates := sanitizer.ParseSrcSetAttribute(srcsetAttrValue) - for _, source := range regexSplitSrcset.Split(attributeValue, -1) { - parts := strings.Split(strings.TrimSpace(source), " ") - nbParts := len(parts) - - if nbParts > 0 { - rewrittenSource := parts[0] - if !isDataURL(rewrittenSource) { - rewrittenSource = ProxifyURL(router, rewrittenSource) - } - - if nbParts > 1 { - rewrittenSource += " " + parts[1] - } - - proxifiedSources = append(proxifiedSources, rewrittenSource) + for _, imageCandidate := range imageCandidates { + if !isDataURL(imageCandidate.ImageURL) && (proxyImages == "all" || !url.IsHTTPS(imageCandidate.ImageURL)) { + imageCandidate.ImageURL = ProxifyURL(router, imageCandidate.ImageURL) } } - if len(proxifiedSources) > 0 { - element.SetAttr("srcset", strings.Join(proxifiedSources, ", ")) - } + element.SetAttr("srcset", imageCandidates.String()) } func isDataURL(s string) bool { diff --git a/proxy/image_proxy_test.go b/proxy/image_proxy_test.go index 3b91c0fd..c658caef 100644 --- a/proxy/image_proxy_test.go +++ b/proxy/image_proxy_test.go @@ -234,8 +234,31 @@ func TestProxyFilterWithPictureSource(t *testing.T) { r := mux.NewRouter() r.HandleFunc("/proxy/{encodedURL}", func(w http.ResponseWriter, r *http.Request) {}).Name("proxy") - input := `` - expected := `` + input := `` + expected := `` + output := ImageProxyRewriter(r, input) + + if expected != output { + t.Errorf(`Not expected output: got %s`, output) + } +} + +func TestProxyFilterOnlyNonHTTPWithPictureSource(t *testing.T) { + os.Clearenv() + os.Setenv("PROXY_IMAGES", "https") + + var err error + parser := config.NewParser() + config.Opts, err = parser.ParseEnvironmentVariables() + if err != nil { + t.Fatalf(`Parsing failure: %v`, err) + } + + r := mux.NewRouter() + r.HandleFunc("/proxy/{encodedURL}", func(w http.ResponseWriter, r *http.Request) {}).Name("proxy") + + input := `` + expected := `` output := ImageProxyRewriter(r, input) if expected != output { diff --git a/reader/sanitizer/sanitizer.go b/reader/sanitizer/sanitizer.go index 92ae869e..f133247b 100644 --- a/reader/sanitizer/sanitizer.go +++ b/reader/sanitizer/sanitizer.go @@ -20,7 +20,6 @@ import ( var ( youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)`) - splitSrcsetRegex = regexp.MustCompile(`,\s?`) ) // Sanitize returns safe HTML. @@ -447,52 +446,17 @@ func isBlockedTag(tagName string) bool { return false } -/* - -One or more strings separated by commas, indicating possible image sources for the user agent to use. - -Each string is composed of: -- A URL to an image -- Optionally, whitespace followed by one of: -- A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density. -- A pixel density descriptor (a positive floating point number directly followed by x). - -*/ func sanitizeSrcsetAttr(baseURL, value string) string { - var sanitizedSources []string - rawSources := splitSrcsetRegex.Split(value, -1) - for _, rawSource := range rawSources { - parts := strings.Split(strings.TrimSpace(rawSource), " ") - nbParts := len(parts) + imageCandidates := ParseSrcSetAttribute(value) - if nbParts > 0 { - sanitizedSource, err := url.AbsoluteURL(baseURL, parts[0]) - if err != nil { - continue - } - - if nbParts == 2 && isValidWidthOrDensityDescriptor(parts[1]) { - sanitizedSource += " " + parts[1] - } - - sanitizedSources = append(sanitizedSources, sanitizedSource) + for _, imageCandidate := range imageCandidates { + absoluteURL, err := url.AbsoluteURL(baseURL, imageCandidate.ImageURL) + if err == nil { + imageCandidate.ImageURL = absoluteURL } } - return strings.Join(sanitizedSources, ", ") -} -func isValidWidthOrDensityDescriptor(value string) bool { - if value == "" { - return false - } - - lastChar := value[len(value)-1:] - if lastChar != "w" && lastChar != "x" { - return false - } - - _, err := strconv.ParseFloat(value[0:len(value)-1], 32) - return err == nil + return imageCandidates.String() } func isValidDataAttribute(value string) bool { diff --git a/reader/sanitizer/sanitizer_test.go b/reader/sanitizer/sanitizer_test.go index aee7ba4e..695c1e49 100644 --- a/reader/sanitizer/sanitizer_test.go +++ b/reader/sanitizer/sanitizer_test.go @@ -85,16 +85,6 @@ func TestMediumImgWithSrcset(t *testing.T) { } } -func TestEconomistImgWithSrcset(t *testing.T) { - input := `` - expected := `` - output := Sanitize("http://example.org/", input) - - if output != expected { - t.Errorf(`Wrong output: %s`, output) - } -} - func TestSelfClosingTags(t *testing.T) { input := `

This
is a text
with an image: Test.

` output := Sanitize("http://example.org/", input) diff --git a/reader/sanitizer/srcset.go b/reader/sanitizer/srcset.go new file mode 100644 index 00000000..758d8d5c --- /dev/null +++ b/reader/sanitizer/srcset.go @@ -0,0 +1,82 @@ +// Copyright 2022 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import ( + "fmt" + "strconv" + "strings" +) + +type ImageCandidate struct { + ImageURL string + Descriptor string +} + +type ImageCandidates []*ImageCandidate + +func (c ImageCandidates) String() string { + var htmlCandidates []string + + for _, imageCandidate := range c { + var htmlCandidate string + if imageCandidate.Descriptor != "" { + htmlCandidate = fmt.Sprintf(`%s %s`, imageCandidate.ImageURL, imageCandidate.Descriptor) + } else { + htmlCandidate = imageCandidate.ImageURL + } + + htmlCandidates = append(htmlCandidates, htmlCandidate) + } + + return strings.Join(htmlCandidates, ", ") +} + +// ParseSrcSetAttribute returns the list of image candidates from the set. +// https://html.spec.whatwg.org/#parse-a-srcset-attribute +func ParseSrcSetAttribute(attributeValue string) (imageCandidates ImageCandidates) { + unparsedCandidates := strings.Split(attributeValue, ", ") + + for _, unparsedCandidate := range unparsedCandidates { + if candidate, err := parseImageCandidate(unparsedCandidate); err == nil { + imageCandidates = append(imageCandidates, candidate) + } + } + + return imageCandidates +} + +func parseImageCandidate(input string) (*ImageCandidate, error) { + input = strings.TrimSpace(input) + parts := strings.Split(strings.TrimSpace(input), " ") + nbParts := len(parts) + + if nbParts > 2 || nbParts == 0 { + return nil, fmt.Errorf(`srcset: invalid number of descriptors`) + } + + if nbParts == 2 { + if !isValidWidthOrDensityDescriptor(parts[1]) { + return nil, fmt.Errorf(`srcset: invalid descriptor`) + } + return &ImageCandidate{ImageURL: parts[0], Descriptor: parts[1]}, nil + } + + return &ImageCandidate{ImageURL: parts[0]}, nil +} + +func isValidWidthOrDensityDescriptor(value string) bool { + if value == "" { + return false + } + + lastChar := value[len(value)-1:] + if lastChar != "w" && lastChar != "x" { + return false + } + + _, err := strconv.ParseFloat(value[0:len(value)-1], 32) + return err == nil +} diff --git a/reader/sanitizer/srcset_test.go b/reader/sanitizer/srcset_test.go new file mode 100644 index 00000000..4f19f9c3 --- /dev/null +++ b/reader/sanitizer/srcset_test.go @@ -0,0 +1,85 @@ +// Copyright 2022 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import "testing" + +func TestParseSrcSetAttributeWithRelativeURLs(t *testing.T) { + input := `example-320w.jpg, example-480w.jpg 1.5x, example-640,w.jpg 2x, example-640w.jpg 640w` + candidates := ParseSrcSetAttribute(input) + + if len(candidates) != 4 { + t.Error(`Incorrect number of candidates`) + } + + if candidates.String() != `example-320w.jpg, example-480w.jpg 1.5x, example-640,w.jpg 2x, example-640w.jpg 640w` { + t.Errorf(`Unexpected string output`) + } +} + +func TestParseSrcSetAttributeWithAbsoluteURLs(t *testing.T) { + input := `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x` + candidates := ParseSrcSetAttribute(input) + + if len(candidates) != 2 { + t.Error(`Incorrect number of candidates`) + } + + if candidates.String() != `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x` { + t.Errorf(`Unexpected string output`) + } +} + +func TestParseSrcSetAttributeWithOneCandidate(t *testing.T) { + input := `http://example.org/example-320w.jpg` + candidates := ParseSrcSetAttribute(input) + + if len(candidates) != 1 { + t.Error(`Incorrect number of candidates`) + } + + if candidates.String() != `http://example.org/example-320w.jpg` { + t.Errorf(`Unexpected string output`) + } +} + +func TestParseSrcSetAttributeWithCommaURL(t *testing.T) { + input := `http://example.org/example,a:b/d.jpg , example-480w.jpg 1.5x` + candidates := ParseSrcSetAttribute(input) + + if len(candidates) != 2 { + t.Error(`Incorrect number of candidates`) + } + + if candidates.String() != `http://example.org/example,a:b/d.jpg, example-480w.jpg 1.5x` { + t.Errorf(`Unexpected string output`) + } +} + +func TestParseSrcSetAttributeWithIncorrectDescriptor(t *testing.T) { + input := `http://example.org/example-320w.jpg test` + candidates := ParseSrcSetAttribute(input) + + if len(candidates) != 0 { + t.Error(`Incorrect number of candidates`) + } + + if candidates.String() != `` { + t.Errorf(`Unexpected string output`) + } +} + +func TestParseSrcSetAttributeWithTooManyDescriptors(t *testing.T) { + input := `http://example.org/example-320w.jpg 10w 1x` + candidates := ParseSrcSetAttribute(input) + + if len(candidates) != 0 { + t.Error(`Incorrect number of candidates`) + } + + if candidates.String() != `` { + t.Errorf(`Unexpected string output`) + } +}