Significantly simplify/speed up the sanitizer

- Use constant-time map lookups instead of iterating over the maps
- Build the large tag allowlist map as a single inline literal instead of
  constructing it item by item (and remove a duplicate key/value pair, `del`)
- Use the standard library `slices` package (`slices.Contains`, `slices.ContainsFunc`) instead of hand-rolled loops
This commit is contained in:
jvoisin 2024-02-25 04:39:00 +01:00 committed by Frédéric Guillot
parent eae4cb1417
commit 54b5be5e7d
1 changed file with 87 additions and 126 deletions

View File

@ -8,6 +8,7 @@ import (
"fmt" "fmt"
"io" "io"
"regexp" "regexp"
"slices"
"strconv" "strconv"
"strings" "strings"
@ -183,24 +184,16 @@ func getExtraAttributes(tagName string) ([]string, []string) {
} }
func isValidTag(tagName string) bool { func isValidTag(tagName string) bool {
for element := range getTagAllowList() { if _, ok := getTagAllowList()[tagName]; ok {
if tagName == element { return true
return true
}
} }
return false return false
} }
func isValidAttribute(tagName, attributeName string) bool { func isValidAttribute(tagName, attributeName string) bool {
for element, attributes := range getTagAllowList() { if attributes, ok := getTagAllowList()[tagName]; ok {
if tagName == element { return inList(attributeName, attributes)
if inList(attributeName, attributes) {
return true
}
}
} }
return false return false
} }
@ -235,24 +228,21 @@ func isPixelTracker(tagName string, attributes []html.Attribute) bool {
} }
func hasRequiredAttributes(tagName string, attributes []string) bool { func hasRequiredAttributes(tagName string, attributes []string) bool {
elements := make(map[string][]string) elements := map[string][]string{
elements["a"] = []string{"href"} "a": {"href"},
elements["iframe"] = []string{"src"} "iframe": {"src"},
elements["img"] = []string{"src"} "img": {"src"},
elements["source"] = []string{"src", "srcset"} "source": {"src", "srcset"},
}
for element, attrs := range elements { if attrs, ok := elements[tagName]; ok {
if tagName == element { for _, attribute := range attributes {
for _, attribute := range attributes { if slices.Contains(attrs, attribute) {
for _, attr := range attrs { return true
if attr == attribute {
return true
}
}
} }
return false
} }
return false
} }
return true return true
@ -303,13 +293,9 @@ func hasValidURIScheme(src string) bool {
"hack://", // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB "hack://", // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
} }
for _, prefix := range whitelist { return slices.ContainsFunc(whitelist, func(prefix string) bool {
if strings.HasPrefix(src, prefix) { return strings.HasPrefix(src, prefix)
return true })
}
}
return false
} }
func isBlockedResource(src string) bool { func isBlockedResource(src string) bool {
@ -322,13 +308,9 @@ func isBlockedResource(src string) bool {
"feeds.feedburner.com", "feeds.feedburner.com",
} }
for _, element := range blacklist { return slices.ContainsFunc(blacklist, func(element string) bool {
if strings.Contains(src, element) { return strings.Contains(src, element)
return true })
}
}
return false
} }
func isValidIframeSource(baseURL, src string) bool { func isValidIframeSource(baseURL, src string) bool {
@ -364,83 +346,72 @@ func isValidIframeSource(baseURL, src string) bool {
return true return true
} }
for _, prefix := range whitelist { return slices.ContainsFunc(whitelist, func(prefix string) bool {
if strings.HasPrefix(src, prefix) { return strings.HasPrefix(src, prefix)
return true })
}
}
return false
} }
func getTagAllowList() map[string][]string { func getTagAllowList() map[string][]string {
whitelist := make(map[string][]string) return map[string][]string{
whitelist["img"] = []string{"alt", "title", "src", "srcset", "sizes", "width", "height"} "a": {"href", "title", "id"},
whitelist["picture"] = []string{} "abbr": {"title"},
whitelist["audio"] = []string{"src"} "acronym": {"title"},
whitelist["video"] = []string{"poster", "height", "width", "src"} "audio": {"src"},
whitelist["source"] = []string{"src", "type", "srcset", "sizes", "media"} "blockquote": {},
whitelist["dt"] = []string{"id"} "br": {},
whitelist["dd"] = []string{"id"} "caption": {},
whitelist["dl"] = []string{"id"} "cite": {},
whitelist["table"] = []string{} "code": {},
whitelist["caption"] = []string{} "dd": {"id"},
whitelist["thead"] = []string{} "del": {},
whitelist["tfooter"] = []string{} "dfn": {},
whitelist["tr"] = []string{} "dl": {"id"},
whitelist["td"] = []string{"rowspan", "colspan"} "dt": {"id"},
whitelist["th"] = []string{"rowspan", "colspan"} "em": {},
whitelist["h1"] = []string{"id"} "figcaption": {},
whitelist["h2"] = []string{"id"} "figure": {},
whitelist["h3"] = []string{"id"} "h1": {"id"},
whitelist["h4"] = []string{"id"} "h2": {"id"},
whitelist["h5"] = []string{"id"} "h3": {"id"},
whitelist["h6"] = []string{"id"} "h4": {"id"},
whitelist["strong"] = []string{} "h5": {"id"},
whitelist["em"] = []string{} "h6": {"id"},
whitelist["code"] = []string{} "iframe": {"width", "height", "frameborder", "src", "allowfullscreen"},
whitelist["pre"] = []string{} "img": {"alt", "title", "src", "srcset", "sizes", "width", "height"},
whitelist["blockquote"] = []string{} "ins": {},
whitelist["q"] = []string{"cite"} "kbd": {},
whitelist["p"] = []string{} "li": {"id"},
whitelist["ul"] = []string{"id"} "ol": {"id"},
whitelist["li"] = []string{"id"} "p": {},
whitelist["ol"] = []string{"id"} "picture": {},
whitelist["br"] = []string{} "pre": {},
whitelist["del"] = []string{} "q": {"cite"},
whitelist["a"] = []string{"href", "title", "id"} "rp": {},
whitelist["figure"] = []string{} "rt": {},
whitelist["figcaption"] = []string{} "rtc": {},
whitelist["cite"] = []string{} "ruby": {},
whitelist["time"] = []string{"datetime"} "s": {},
whitelist["abbr"] = []string{"title"} "samp": {},
whitelist["acronym"] = []string{"title"} "source": {"src", "type", "srcset", "sizes", "media"},
whitelist["wbr"] = []string{} "strong": {},
whitelist["dfn"] = []string{} "sub": {},
whitelist["sub"] = []string{} "sup": {"id"},
whitelist["sup"] = []string{"id"} "table": {},
whitelist["var"] = []string{} "td": {"rowspan", "colspan"},
whitelist["samp"] = []string{} "tfooter": {},
whitelist["s"] = []string{} "th": {"rowspan", "colspan"},
whitelist["del"] = []string{} "thead": {},
whitelist["ins"] = []string{} "time": {"datetime"},
whitelist["kbd"] = []string{} "tr": {},
whitelist["rp"] = []string{} "ul": {"id"},
whitelist["rt"] = []string{} "var": {},
whitelist["rtc"] = []string{} "video": {"poster", "height", "width", "src"},
whitelist["ruby"] = []string{} "wbr": {},
whitelist["iframe"] = []string{"width", "height", "frameborder", "src", "allowfullscreen"} }
return whitelist
} }
func inList(needle string, haystack []string) bool { func inList(needle string, haystack []string) bool {
for _, element := range haystack { return slices.Contains(haystack, needle)
if element == needle {
return true
}
}
return false
} }
func rewriteIframeURL(link string) string { func rewriteIframeURL(link string) string {
@ -459,13 +430,7 @@ func isBlockedTag(tagName string) bool {
"style", "style",
} }
for _, element := range blacklist { return slices.Contains(blacklist, tagName)
if element == tagName {
return true
}
}
return false
} }
func sanitizeSrcsetAttr(baseURL, value string) string { func sanitizeSrcsetAttr(baseURL, value string) string {
@ -493,13 +458,9 @@ func isValidDataAttribute(value string) bool {
"data:image/gif", "data:image/gif",
"data:image/webp", "data:image/webp",
} }
return slices.ContainsFunc(dataAttributeAllowList, func(prefix string) bool {
for _, prefix := range dataAttributeAllowList { return strings.HasPrefix(value, prefix)
if strings.HasPrefix(value, prefix) { })
return true
}
}
return false
} }
func isAnchor(tagName string, attribute html.Attribute) bool { func isAnchor(tagName string, attribute html.Attribute) bool {