Add rewrite rule to fix Medium.com images

This commit is contained in:
Frédéric Guillot 2020-09-29 22:22:25 -07:00 committed by Frédéric Guillot
parent d75ff0c5ab
commit 31435ef83e
6 changed files with 89 additions and 40 deletions

View File

@ -76,7 +76,7 @@ func ExtractContent(page io.Reader) (string, error) {
return "", err
}
document.Find("script,style,noscript").Each(func(i int, s *goquery.Selection) {
document.Find("script,style").Each(func(i int, s *goquery.Selection) {
removeNodes(s)
})

View File

@ -139,6 +139,21 @@ func addDynamicImage(entryURL, entryContent string) string {
return entryContent
}
// fixMediumImages replaces each <figure class="paragraph-image"> element found
// in entryContent with the markup carried by its <noscript> descendant, and
// returns the resulting HTML of the document body.
//
// NOTE(review): presumably Medium lazy-loads article images, leaving only a
// placeholder in the figure and the real <img> inside <noscript> -- confirm
// against a live article.
//
// entryURL is not used; it is accepted so the function has the same signature
// as the other rewrite functions in this file.
//
// Figures that have no <noscript> fallback (or an empty one) are left
// untouched instead of being replaced by an empty string, which would remove
// the image from the entry. If the content cannot be parsed or serialized, the
// original entryContent is returned unchanged.
func fixMediumImages(entryURL, entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
		// Text() is used because the <noscript> content is expected to be
		// available as raw text holding the fallback markup. It returns ""
		// when the selection is empty, so this one check covers both a
		// missing and an empty <noscript>.
		fallback := paragraphImage.Find("noscript").Text()
		if strings.TrimSpace(fallback) == "" {
			return
		}
		paragraphImage.ReplaceWithHtml(fallback)
	})

	output, err := doc.Find("body").First().Html()
	if err != nil {
		// Best effort: keep the untouched entry rather than returning a
		// partial or empty document.
		return entryContent
	}
	return output
}
func addYoutubeVideo(entryURL, entryContent string) string {
matches := youtubeRegex.FindStringSubmatch(entryURL)

View File

@ -43,6 +43,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
entryContent = replaceLineFeeds(entryContent)
case "convert_text_link", "convert_text_links":
entryContent = replaceTextLinks(entryContent)
case "fix_medium_images":
entryContent = fixMediumImages(entryURL, entryContent)
}
}

View File

@ -4,7 +4,10 @@
package rewrite // import "miniflux.app/reader/rewrite"
import "testing"
import (
"strings"
"testing"
)
func TestReplaceTextLinks(t *testing.T) {
scenarios := map[string]string{
@ -176,3 +179,32 @@ func TestConvertTextLinkRewriteRule(t *testing.T) {
t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
}
}
// TestMediumImage checks that the "fix_medium_images" rewrite rule replaces a
// Medium "paragraph-image" figure with the <img> element stored in the
// figure's <noscript> fallback.
func TestMediumImage(t *testing.T) {
	content := `
<figure class="ht hu hv hw hx hy cy cz paragraph-image">
<div class="hz ia ib ic aj">
<div class="cy cz hs">
<div class="ii s ib ij">
<div class="ik il s">
<div class="id ie t u v if aj bk ig ih">
<img alt="Image for post" class="t u v if aj im in io" src="https://miro.medium.com/max/60/1*ephLSqSzQYLvb7faDwzRbw.jpeg?q=20" width="1280" height="720"/>
</div>
<img alt="Image for post" class="id ie t u v if aj c" width="1280" height="720"/>
<noscript>
<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcSet="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>
</noscript>
</div>
</div>
</div>
</div>
</figure>
`
	// The expected markup spells the attribute "srcset" in lower case, while
	// the fixture above uses "srcSet".
	expected := `<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcset="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>`

	output := Rewriter("https://example.org/article", content, "fix_medium_images")
	// Surrounding whitespace from the fixture is not part of the comparison.
	output = strings.TrimSpace(output)

	if expected != output {
		// Report both values, like the other tests in this file, so a failure
		// can be diagnosed from the log alone.
		t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
	}
}

View File

@ -30,4 +30,5 @@ var predefinedRules = map[string]string{
"invidio.us": "add_invidious_video",
"xkcd.com": "add_image_title",
"framatube.org": "nl2br,convert_text_link",
"medium.com": "fix_medium_images",
}

View File

@ -7,43 +7,42 @@ package scraper // import "miniflux.app/reader/scraper"
// List of predefined scraper rules (alphabetically sorted)
// domain => CSS selectors
var predefinedRules = map[string]string{
"bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
"cbc.ca": ".story-content",
"darkreading.com": "#article-main:not(header)",
"developpez.com": "div[itemprop=articleBody]",
"dilbert.com": "span.comic-title-name, img.img-comic",
"bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
"cbc.ca": ".story-content",
"darkreading.com": "#article-main:not(header)",
"developpez.com": "div[itemprop=articleBody]",
"dilbert.com": "span.comic-title-name, img.img-comic",
"financialsamurai.com": "article",
"francetvinfo.fr": ".text",
"github.com": "article.entry-content",
"heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
"igen.fr": "section.corps",
"ing.dk": "section.body",
"lapresse.ca": ".amorce, .entry",
"lemonde.fr": "article",
"lepoint.fr": ".art-text",
"lesjoiesducode.fr": ".blog-post-content img",
"lesnumeriques.com": ".text",
"linux.com": "div.content, div[property]",
"medium.com": ".section-content",
"mac4ever.com": "div[itemprop=articleBody]",
"monwindows.com": ".blog-post-body",
"npr.org": "#storytext",
"oneindia.com": ".io-article-body",
"opensource.com": "div[property]",
"osnews.com": "div.newscontent1",
"phoronix.com": "div.content",
"pseudo-sciences.org": "#art_main",
"raywenderlich.com": "article",
"slate.fr": ".field-items",
"techcrunch.com": "div.article-entry",
"theoatmeal.com": "div#comic",
"theregister.co.uk": "#body",
"turnoff.us": "article.post-content",
"universfreebox.com": "#corps_corps",
"version2.dk": "section.body",
"wdwnt.com": "div.entry-content",
"wired.com": "main figure, article",
"zeit.de": ".summary, .article-body",
"zdnet.com": "div.storyBody",
"openingsource.org": "article.suxing-popup-gallery",
"francetvinfo.fr": ".text",
"github.com": "article.entry-content",
"heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
"igen.fr": "section.corps",
"ing.dk": "section.body",
"lapresse.ca": ".amorce, .entry",
"lemonde.fr": "article",
"lepoint.fr": ".art-text",
"lesjoiesducode.fr": ".blog-post-content img",
"lesnumeriques.com": ".text",
"linux.com": "div.content, div[property]",
"mac4ever.com": "div[itemprop=articleBody]",
"monwindows.com": ".blog-post-body",
"npr.org": "#storytext",
"oneindia.com": ".io-article-body",
"opensource.com": "div[property]",
"osnews.com": "div.newscontent1",
"phoronix.com": "div.content",
"pseudo-sciences.org": "#art_main",
"raywenderlich.com": "article",
"slate.fr": ".field-items",
"techcrunch.com": "div.article-entry",
"theoatmeal.com": "div#comic",
"theregister.co.uk": "#body",
"turnoff.us": "article.post-content",
"universfreebox.com": "#corps_corps",
"version2.dk": "section.body",
"wdwnt.com": "div.entry-content",
"wired.com": "main figure, article",
"zeit.de": ".summary, .article-body",
"zdnet.com": "div.storyBody",
"openingsource.org": "article.suxing-popup-gallery",
}