diff --git a/internal/reader/handler/handler.go b/internal/reader/handler/handler.go index 91b7e394..9ab454e3 100644 --- a/internal/reader/handler/handler.go +++ b/internal/reader/handler/handler.go @@ -241,8 +241,8 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool func checkFeedIcon(store *storage.Storage, feedID int64, websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) { if !store.HasIcon(feedID) { - icon, err := icon.FindIcon(websiteURL, feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates) - if err != nil { + iconFinder := icon.NewIconFinder(websiteURL, feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates) + if icon, err := iconFinder.FindIcon(); err != nil { slog.Warn("Unable to find feed icon", slog.Int64("feed_id", feedID), slog.String("website_url", websiteURL), diff --git a/internal/reader/icon/finder.go b/internal/reader/icon/finder.go index 343e80f4..33e73b51 100644 --- a/internal/reader/icon/finder.go +++ b/internal/reader/icon/finder.go @@ -7,6 +7,7 @@ import ( "encoding/base64" "fmt" "io" + "log/slog" "net/url" "strings" @@ -19,106 +20,170 @@ import ( "github.com/PuerkitoBio/goquery" ) -// FindIcon try to find the website's icon. -func FindIcon(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (icon *model.Icon, err error) { - if feedIconURL == "" { - feedIconURL, err = fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent, fetchViaProxy, allowSelfSignedCertificates) - if err != nil { - return nil, err +type IconFinder struct { + websiteURL string + feedIconURL string + userAgent string + fetchViaProxy bool + allowSelfSignedCertificates bool +} + +func NewIconFinder(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) *IconFinder { + return &IconFinder{ + websiteURL: websiteURL, + feedIconURL: feedIconURL, + userAgent: userAgent, + fetchViaProxy: fetchViaProxy, + allowSelfSignedCertificates: allowSelfSignedCertificates, + } +} + +func (f *IconFinder) FindIcon() (*model.Icon, error) { + slog.Debug("Begin icon discovery process", + slog.String("website_url", f.websiteURL), + slog.String("feed_icon_url", f.feedIconURL), + ) + + if f.feedIconURL != "" { + if icon, err := f.FetchFeedIcon(); err != nil { + slog.Debug("Unable to download icon from feed", + slog.String("website_url", f.websiteURL), + slog.String("feed_icon_url", f.feedIconURL), + slog.Any("error", err), + ) + } else if icon != nil { + return icon, nil } } - if strings.HasPrefix(feedIconURL, "data:") { - return parseImageDataURL(feedIconURL) + if icon, err := f.FetchIconsFromHTMLDocument(); err != nil { + slog.Debug("Unable to fetch icons from HTML document", + slog.String("website_url", f.websiteURL), + slog.Any("error", err), + ) + } else if icon != nil { + return icon, nil } - feedIconURL, err = generateIconURL(websiteURL, feedIconURL) + return f.FetchDefaultIcon() +} + +func (f *IconFinder) FetchDefaultIcon() (*model.Icon, error) { + slog.Debug("Fetching default icon", + slog.String("website_url", f.websiteURL), + ) + + iconURL, err := urllib.JoinBaseURLAndPath(urllib.RootURL(f.websiteURL), "favicon.ico") if err != nil { - return nil, err + return nil, fmt.Errorf(`icon: unable to join root URL and path: %w`, err) } - if icon, err = downloadIcon(feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates); err != nil { + icon, err := f.DownloadIcon(iconURL) + if err != nil { return nil, err } return icon, nil } -func generateIconURL(websiteURL, feedIconURL string) (iconURL string, err error) { - feedIconURL = strings.TrimSpace(feedIconURL) +func (f *IconFinder) FetchFeedIcon() (*model.Icon, error) { + slog.Debug("Fetching feed icon", + slog.String("website_url", f.websiteURL), + slog.String("feed_icon_url", f.feedIconURL), + ) - if feedIconURL == "" { - iconURL, err = urllib.JoinBaseURLAndPath(urllib.RootURL(websiteURL), "favicon.ico") - if err != nil { - return "", fmt.Errorf(`icon: unable to join base URL and path: %w`, err) + iconURL, err := urllib.AbsoluteURL(f.websiteURL, f.feedIconURL) + if err != nil { + return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err) + } + + return f.DownloadIcon(iconURL) +} + +func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) { + slog.Debug("Searching icons from HTML document", + slog.String("website_url", f.websiteURL), + ) + + documentBody, err := f.FetchRootDocument() + if err != nil { + return nil, err + } + + iconURLs, err := findIconURLsFromHTMLDocument(documentBody) + if err != nil { + return nil, err + } + + slog.Debug("Searched icon from HTML document", + slog.String("website_url", f.websiteURL), + slog.String("icon_urls", strings.Join(iconURLs, ",")), + ) + + for _, iconURL := range iconURLs { + if strings.HasPrefix(iconURL, "data:") { + slog.Debug("Found icon with data URL", + slog.String("website_url", f.websiteURL), + ) + return parseImageDataURL(iconURL) } - } else { - iconURL, err = urllib.AbsoluteURL(websiteURL, feedIconURL) + + iconURL, err = urllib.AbsoluteURL(f.websiteURL, iconURL) if err != nil { - return "", fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err) + return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err) + } + + if icon, err := f.DownloadIcon(iconURL); err != nil { + slog.Debug("Unable to download icon from HTML document", + slog.String("website_url", f.websiteURL), + slog.String("icon_url", iconURL), + slog.Any("error", err), + ) + } else if icon != nil { + slog.Debug("Found icon from HTML document", + slog.String("website_url", f.websiteURL), + slog.String("icon_url", iconURL), + ) + return icon, nil } } - return iconURL, nil + return nil, nil } -func fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (string, error) { - rootURL := urllib.RootURL(websiteURL) +func (f *IconFinder) FetchRootDocument() (io.Reader, error) { + rootURL := urllib.RootURL(f.websiteURL) clt := client.NewClientWithConfig(rootURL, config.Opts) - clt.WithUserAgent(userAgent) - clt.AllowSelfSignedCertificates = allowSelfSignedCertificates + clt.WithUserAgent(f.userAgent) + clt.AllowSelfSignedCertificates = f.allowSelfSignedCertificates - if fetchViaProxy { + if f.fetchViaProxy { clt.WithProxy() } response, err := clt.Get() if err != nil { - return "", fmt.Errorf("icon: unable to download website index page: %v", err) + return nil, fmt.Errorf("icon: unable to download website index page: %v", err) } if response.HasServerFailure() { - return "", fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode) + return nil, fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode) } - return findIconURLFromHTMLDocument(response.Body) + return response.Body, nil } -func findIconURLFromHTMLDocument(body io.Reader) (string, error) { - queries := []string{ - "link[rel='shortcut icon']", - "link[rel='Shortcut Icon']", - "link[rel='icon shortcut']", - "link[rel='icon']", - } +func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) { + slog.Debug("Downloading icon", + slog.String("website_url", f.websiteURL), + slog.String("icon_url", iconURL), + ) - doc, err := goquery.NewDocumentFromReader(body) - if err != nil { - return "", fmt.Errorf("icon: unable to read document: %v", err) - } - - var iconURL string - for _, query := range queries { - doc.Find(query).Each(func(i int, s *goquery.Selection) { - if href, exists := s.Attr("href"); exists { - iconURL = strings.TrimSpace(href) - } - }) - - if iconURL != "" { - break - } - } - - return iconURL, nil -} - -func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (*model.Icon, error) { clt := client.NewClientWithConfig(iconURL, config.Opts) - clt.WithUserAgent(userAgent) - clt.AllowSelfSignedCertificates = allowSelfSignedCertificates - if fetchViaProxy { + clt.WithUserAgent(f.userAgent) + clt.AllowSelfSignedCertificates = f.allowSelfSignedCertificates + if f.fetchViaProxy { clt.WithProxy() } @@ -149,6 +214,43 @@ func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCerti return icon, nil } +func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) { + queries := []string{ + "link[rel='shortcut icon']", + "link[rel='Shortcut Icon']", + "link[rel='icon shortcut']", + "link[rel='icon']", + } + + doc, err := goquery.NewDocumentFromReader(body) + if err != nil { + return nil, fmt.Errorf("icon: unable to read document: %v", err) + } + + var iconURLs []string + for _, query := range queries { + slog.Debug("Searching icon URL in HTML document", slog.String("query", query)) + + doc.Find(query).Each(func(i int, s *goquery.Selection) { + var iconURL string + + if href, exists := s.Attr("href"); exists { + iconURL = strings.TrimSpace(href) + } + + if iconURL != "" { + iconURLs = append(iconURLs, iconURL) + + slog.Debug("Found icon URL in HTML document", + slog.String("query", query), + slog.String("icon_url", iconURL)) + } + }) + } + + return iconURLs, nil +} + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax // data:[][;base64], func parseImageDataURL(value string) (*model.Icon, error) { diff --git a/internal/reader/icon/finder_test.go b/internal/reader/icon/finder_test.go index 2ea297c1..a6f30738 100644 --- a/internal/reader/icon/finder_test.go +++ b/internal/reader/icon/finder_test.go @@ -112,59 +112,16 @@ func TestParseDocumentWithWhitespaceIconURL(t *testing.T) { /static/img/favicon.ico ">` - iconURL, err := findIconURLFromHTMLDocument(strings.NewReader(html)) + iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html)) if err != nil { t.Fatal(err) } - if iconURL != "/static/img/favicon.ico" { - t.Errorf(`Invalid icon URL, got %q`, iconURL) - } -} - -func TestGenerateIconURL(t *testing.T) { - iconURL, err := generateIconURL("https://example.org/", "/favicon.png") - if err != nil { - t.Fatal(err) - } - - if iconURL != "https://example.org/favicon.png" { - t.Errorf(`Invalid icon URL, got %q`, iconURL) - } - - iconURL, err = generateIconURL("https://example.org/", "img/favicon.png") - if err != nil { - t.Fatal(err) - } - - if iconURL != "https://example.org/img/favicon.png" { - t.Errorf(`Invalid icon URL, got %q`, iconURL) - } - - iconURL, err = generateIconURL("https://example.org/", "https://example.org/img/favicon.png") - if err != nil { - t.Fatal(err) - } - - if iconURL != "https://example.org/img/favicon.png" { - t.Errorf(`Invalid icon URL, got %q`, iconURL) - } - - iconURL, err = generateIconURL("https://example.org/", "//example.org/img/favicon.png") - if err != nil { - t.Fatal(err) - } - - if iconURL != "https://example.org/img/favicon.png" { - t.Errorf(`Invalid icon URL, got %q`, iconURL) - } - - iconURL, err = generateIconURL("https://example.org/", " ") - if err != nil { - t.Fatal(err) - } - - if iconURL != "https://example.org/favicon.ico" { - t.Errorf(`Invalid icon URL, got %q`, iconURL) + if len(iconURLs) != 1 { + t.Fatalf(`Invalid number of icon URLs, got %d`, len(iconURLs)) + } + + if iconURLs[0] != "/static/img/favicon.ico" { + t.Errorf(`Invalid icon URL, got %q`, iconURLs[0]) } }