Basic table removal rule

This commit is contained in:
Jake Walker 2023-03-31 19:23:31 +01:00 committed by Frédéric Guillot
parent 9a826bbe6f
commit 49d2596fc6
3 changed files with 43 additions and 0 deletions

View File

@ -335,3 +335,34 @@ func parseMarkdown(entryContent string) string {
return sb.String()
}
// removeTables strips HTML tables from entryContent, replacing each table
// with the inner HTML of its first <td> cell.
//
// Tables are handled one at a time in document order, so a table nested
// inside the kept cell is unwrapped on a later pass. Each table is replaced
// in place, which keeps the cell content at the table's original position
// relative to its siblings. Tables that contain no <td> are left untouched
// and do not prevent later tables from being processed.
//
// The result is the inner HTML of the parsed document's <body>. If the
// content cannot be parsed or serialized, entryContent is returned unchanged.
func removeTables(entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	for {
		// Re-query on every pass: replacing a table re-parses its cell
		// content, so nodes matched earlier may no longer be attached.
		// Restricting the match to tables that contain a <td> guarantees
		// every pass removes one table, so the loop always terminates.
		table := doc.Find("table").Has("td").First()
		if table.Length() == 0 {
			break
		}

		cellHTML, err := table.Find("td").First().Html()
		if err != nil {
			return entryContent
		}

		// Insert the cell content where the table was and detach the table,
		// rather than appending to the parent, which would move the content
		// after any siblings that followed the table.
		table.ReplaceWithHtml(cellHTML)
	}

	output, err := doc.Find("body").First().Html()
	if err != nil {
		return entryContent
	}
	return output
}

View File

@ -110,6 +110,8 @@ func applyRule(entryURL, entryContent string, rule rule) string {
}
case "parse_markdown":
entryContent = parseMarkdown(entryContent)
case "remove_tables":
entryContent = removeTables(entryContent)
}
return entryContent

View File

@ -325,3 +325,13 @@ func TestRewriteBase64DecodeArgs(t *testing.T) {
t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
}
}
// TestRewriteRemoveTables checks that running the remove_tables rule through
// Rewriter on a table nested inside another table's cell leaves only the
// cell contents, with no table markup remaining.
func TestRewriteRemoveTables(t *testing.T) {
	var (
		input = `<table class="container"><tbody><tr><td><p>Test</p><table class="row"><tbody><tr><td>Hello World!</td></tr></tbody></table></td></tr></tbody></table>`
		want  = `<p>Test</p>Hello World!`
	)

	got := Rewriter("https://example.org/article", input, `remove_tables`)
	if got == want {
		return
	}

	t.Errorf(`Not expected output: got "%s" instead of "%s"`, got, want)
}