Slightly improve internal/reader/scraper/scraper.go

- make findContentUsingCustomRules more idiomatic,
  since in Go a function returning an error might
  return garbage in its other return values. Moreover,
  ignoring errors is bad practice.
- getPredefinedScraperRules now runs in constant time (a direct map lookup),
  instead of iterating over a map with around 50 entries in it.
This commit is contained in:
jvoisin 2024-02-26 17:37:49 +01:00 committed by Frédéric Guillot
parent 5b2558bf92
commit c2d2f31438
2 changed files with 10 additions and 9 deletions

View File

@ -78,10 +78,9 @@ func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
contents := ""
document.Find(rules).Each(func(i int, s *goquery.Selection) {
var content string
content, _ = goquery.OuterHtml(s)
contents += content
if content, err := goquery.OuterHtml(s); err == nil {
contents += content
}
})
return contents, nil
@ -89,13 +88,11 @@ func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
func getPredefinedScraperRules(websiteURL string) string {
urlDomain := urllib.Domain(websiteURL)
urlDomain = strings.TrimPrefix(urlDomain, "www.")
for domain, rules := range predefinedRules {
if strings.Contains(urlDomain, domain) {
return rules
}
if rules, ok := predefinedRules[urlDomain]; ok {
return rules
}
return ""
}

View File

@ -19,6 +19,10 @@ func TestGetPredefinedRules(t *testing.T) {
t.Error("Unable to find rule for linux.com")
}
if getPredefinedScraperRules("https://linux.com/") == "" {
t.Error("Unable to find rule for linux.com")
}
if getPredefinedScraperRules("https://example.org/") != "" {
t.Error("A rule not defined should not return anything")
}