From e54825bf02427a19615d094711dd53985e92ccde Mon Sep 17 00:00:00 2001 From: Ztec Date: Thu, 13 Jun 2024 16:30:15 +0200 Subject: [PATCH] Improve YouTube page feed detection In order to be more resilient to YouTube URLs variation and to address this feature_request: https://github.com/miniflux/v2/issues/2628 I've reworked a bit the way the YouTube feed extraction is done. I've kept all the `FindSubscriptionsFromYouTube*` in order to keep all the existing unit tests as-is ensuring little to no regressions. By doing so, I had to call twice `youtubeURLIDExtractor`. Small performance penalty for peace of mind in my opinion. `youtubeURLIDExtractor` is made in a way only one kind of page can be detected at a time. This mean I can solve the "video in a playlist" feature_request by prioritizing the playlist ID over the Video ID Also, by using `url.Parse()` to get ids, it's safer to url mangle and variation. The most common variation being the `t=42` parameters that start the playback at a given position. Previously, this kind of url would not be detected as "YouTube URL". I deliberately ignored the url parsing error to keep previous behavior (skip the YouTube analysis and follow with the other analysis) I also tried to keep debug logs the same as before as much as I could. I manually tested all the YouTube cases (video,channel,playlist) and they all work as expected except for the video. But this one does not work either on main. The `meta` html tag that was searched for does not seem to exist anymore. fix: #2628 --- internal/reader/subscription/finder.go | 144 +++++++++++++------- internal/reader/subscription/finder_test.go | 71 +++++++++- 2 files changed, 165 insertions(+), 50 deletions(-) diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go index 1b359a82..085467fd 100644 --- a/internal/reader/subscription/finder.go +++ b/internal/reader/subscription/finder.go @@ -5,10 +5,13 @@ package subscription // import "miniflux.app/v2/internal/reader/subscription" import ( "bytes" + "errors" "fmt" "io" "log/slog" + "net/url" "regexp" + "strings" "miniflux.app/v2/internal/config" "miniflux.app/v2/internal/integration/rssbridge" @@ -22,10 +25,19 @@ import ( "golang.org/x/net/html/charset" ) +type youtubeKind string + +const ( + youtubeIDKindChannel youtubeKind = "channel" + youtubeIDKindVideo youtubeKind = "video" + youtubeIDKindPlaylist youtubeKind = "playlist" +) + var ( - youtubeChannelRegex = regexp.MustCompile(`youtube\.com/channel/(.*)$`) - youtubeVideoRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)$`) - youtubePlaylistRegex = regexp.MustCompile(`youtube\.com/playlist\?list=(.*)$`) + youtubeHostRegex = regexp.MustCompile(`youtube\.com$`) + youtubeChannelRegex = regexp.MustCompile(`channel/(.*)$`) + + errNotYoutubeUrl = fmt.Errorf("this website is not a YouTube page") ) type SubscriptionFinder struct { @@ -75,43 +87,40 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string) return Subscriptions{NewSubscription(responseHandler.EffectiveURL(), responseHandler.EffectiveURL(), feedFormat)}, nil } - // Step 2) Check if the website URL is a YouTube channel. - slog.Debug("Try to detect feeds from YouTube channel page", slog.String("website_url", websiteURL)) - subscriptions, localizedError := f.FindSubscriptionsFromYouTubeChannelPage(websiteURL) - if localizedError != nil { - return nil, localizedError + subscriptions := make(Subscriptions, 1) + + // Step 2) Parse URL to find feeds from YouTube. + kind, _, err := youtubeURLIDExtractor(websiteURL) + + // If YouTube url has been detected, return the YouTube feed + if err == nil || !errors.Is(err, errNotYoutubeUrl) { + switch kind { + case youtubeIDKindChannel: + slog.Debug("Try to detect feeds from YouTube channel page", slog.String("website_url", websiteURL)) + subscriptions, localizedError = f.FindSubscriptionsFromYouTubeChannelPage(websiteURL) + if localizedError != nil { + return nil, localizedError + } + case youtubeIDKindVideo: + slog.Debug("Try to detect feeds from YouTube video page", slog.String("website_url", websiteURL)) + subscriptions, localizedError = f.FindSubscriptionsFromYouTubeVideoPage(websiteURL) + if localizedError != nil { + return nil, localizedError + } + case youtubeIDKindPlaylist: + slog.Debug("Try to detect feeds from YouTube playlist page", slog.String("website_url", websiteURL)) + subscriptions, localizedError = f.FindSubscriptionsFromYouTubePlaylistPage(websiteURL) + if localizedError != nil { + return nil, localizedError + } + } + if len(subscriptions) > 0 { + slog.Debug("Subscriptions found from YouTube page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions)) + return subscriptions, nil + } } - if len(subscriptions) > 0 { - slog.Debug("Subscriptions found from YouTube channel page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions)) - return subscriptions, nil - } - - // Step 3) Check if the website URL is a YouTube video. - slog.Debug("Try to detect feeds from YouTube video page", slog.String("website_url", websiteURL)) - subscriptions, localizedError = f.FindSubscriptionsFromYouTubeVideoPage(websiteURL) - if localizedError != nil { - return nil, localizedError - } - - if len(subscriptions) > 0 { - slog.Debug("Subscriptions found from YouTube video page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions)) - return subscriptions, nil - } - - // Step 4) Check if the website URL is a YouTube playlist. - slog.Debug("Try to detect feeds from YouTube playlist page", slog.String("website_url", websiteURL)) - subscriptions, localizedError = f.FindSubscriptionsFromYouTubePlaylistPage(websiteURL) - if localizedError != nil { - return nil, localizedError - } - - if len(subscriptions) > 0 { - slog.Debug("Subscriptions found from YouTube playlist page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions)) - return subscriptions, nil - } - - // Step 5) Parse web page to find feeds from HTML meta tags. + // Step 3) Parse web page to find feeds from HTML meta tags. slog.Debug("Try to detect feeds from HTML meta tags", slog.String("website_url", websiteURL), slog.String("content_type", responseHandler.ContentType()), @@ -126,7 +135,7 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string) return subscriptions, nil } - // Step 6) Check if the website URL can use RSS-Bridge. + // Step 4) Check if the website URL can use RSS-Bridge. if rssBridgeURL != "" { slog.Debug("Try to detect feeds with RSS-Bridge", slog.String("website_url", websiteURL)) subscriptions, localizedError := f.FindSubscriptionsFromRSSBridge(websiteURL, rssBridgeURL) @@ -140,7 +149,7 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string) } } - // Step 7) Check if the website has a known feed URL. + // Step 5) Check if the website has a known feed URL. slog.Debug("Try to detect feeds from well-known URLs", slog.String("website_url", websiteURL)) subscriptions, localizedError = f.FindSubscriptionsFromWellKnownURLs(websiteURL) if localizedError != nil { @@ -298,20 +307,24 @@ func (f *SubscriptionFinder) FindSubscriptionsFromRSSBridge(websiteURL, rssBridg } func (f *SubscriptionFinder) FindSubscriptionsFromYouTubeChannelPage(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) { - matches := youtubeChannelRegex.FindStringSubmatch(websiteURL) + kind, id, _ := youtubeURLIDExtractor(websiteURL) - if len(matches) == 2 { - feedURL := fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, matches[1]) + if kind == youtubeIDKindChannel { + feedURL := fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, id) return Subscriptions{NewSubscription(websiteURL, feedURL, parser.FormatAtom)}, nil } - slog.Debug("This website is not a YouTube channel page, the regex doesn't match", slog.String("website_url", websiteURL)) return nil, nil } func (f *SubscriptionFinder) FindSubscriptionsFromYouTubeVideoPage(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) { - if !youtubeVideoRegex.MatchString(websiteURL) { + kind, _, err := youtubeURLIDExtractor(websiteURL) + if err != nil { + slog.Debug("Could not parse url", slog.String("website_url", websiteURL)) + } + + if kind != youtubeIDKindVideo { slog.Debug("This website is not a YouTube video page, the regex doesn't match", slog.String("website_url", websiteURL)) return nil, nil } @@ -337,10 +350,10 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubeVideoPage(websiteURL st } func (f *SubscriptionFinder) FindSubscriptionsFromYouTubePlaylistPage(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) { - matches := youtubePlaylistRegex.FindStringSubmatch(websiteURL) + kind, id, _ := youtubeURLIDExtractor(websiteURL) - if len(matches) == 2 { - feedURL := fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?playlist_id=%s`, matches[1]) + if kind == youtubeIDKindPlaylist { + feedURL := fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?playlist_id=%s`, id) return Subscriptions{NewSubscription(websiteURL, feedURL, parser.FormatAtom)}, nil } @@ -348,3 +361,38 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubePlaylistPage(websiteURL return nil, nil } + +func youtubeURLIDExtractor(websiteURL string) (idKind youtubeKind, id string, err error) { + decodedUrl, err := url.Parse(websiteURL) + if err != nil { + return + } + + if !youtubeHostRegex.MatchString(decodedUrl.Host) { + slog.Debug("This website is not a YouTube page, the regex doesn't match", slog.String("website_url", websiteURL)) + err = errNotYoutubeUrl + return + } + + switch { + case strings.HasPrefix(decodedUrl.Path, "/channel"): + idKind = youtubeIDKindChannel + matches := youtubeChannelRegex.FindStringSubmatch(decodedUrl.Path) + id = matches[1] + return + case strings.HasPrefix(decodedUrl.Path, "/watch") && decodedUrl.Query().Has("list"): + idKind = youtubeIDKindPlaylist + id = decodedUrl.Query().Get("list") + return + case strings.HasPrefix(decodedUrl.Path, "/watch"): + idKind = youtubeIDKindVideo + id = decodedUrl.Query().Get("v") + return + case strings.HasPrefix(decodedUrl.Path, "/playlist"): + idKind = youtubeIDKindPlaylist + id = decodedUrl.Query().Get("list") + return + } + err = fmt.Errorf("unable to extract youtube id from URL: %s", websiteURL) + return +} diff --git a/internal/reader/subscription/finder_test.go b/internal/reader/subscription/finder_test.go index 216d81cb..9e7416a2 100644 --- a/internal/reader/subscription/finder_test.go +++ b/internal/reader/subscription/finder_test.go @@ -4,14 +4,16 @@ package subscription import ( + "errors" "strings" "testing" ) func TestFindYoutubePlaylistFeed(t *testing.T) { scenarios := map[string]string{ - "https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR": "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", - "https://www.youtube.com/playlist?list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM": "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM", + "https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR": "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", + "https://www.youtube.com/playlist?list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM": "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM", + "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM": "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM", } for websiteURL, expectedFeedURL := range scenarios { @@ -30,6 +32,71 @@ func TestFindYoutubePlaylistFeed(t *testing.T) { } } +func TestItDoesNotConsiderPlaylistWatchPageAsVideoWatchPage(t *testing.T) { + _, localizedError := NewSubscriptionFinder(nil).FindSubscriptionsFromYouTubeVideoPage("https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM") + if localizedError != nil { + t.Fatalf(`Should not consider a playlist watch page as a video watch page`) + } +} + +func TestYoutubeIdExtractor(t *testing.T) { + type testResult struct { + ID string + Kind youtubeKind + error error + } + urls := map[string]testResult{ + "https://www.youtube.com/watch?v=dQw4w9WgXcQ": { + ID: "dQw4w9WgXcQ", + Kind: youtubeIDKindVideo, + error: nil, + }, + "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=1": { + ID: "dQw4w9WgXcQ", + Kind: youtubeIDKindVideo, + error: nil, + }, + "https://www.youtube.com/watch?t=1&v=dQw4w9WgXcQ": { + ID: "dQw4w9WgXcQ", + Kind: youtubeIDKindVideo, + error: nil, + }, + "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM": { + ID: "PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM", + Kind: youtubeIDKindPlaylist, + error: nil, + }, + "https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR": { + ID: "PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", + Kind: youtubeIDKindPlaylist, + error: nil, + }, + "https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw": { + ID: "UC-Qj80avWItNRjkZ41rzHyw", + Kind: youtubeIDKindChannel, + error: nil, + }, + "https://www.example.com/channel/UC-Qj80avWItNRjkZ41rzHyw": { + ID: "", + Kind: "", + error: errNotYoutubeUrl, + }, + } + + for websiteURL, expected := range urls { + kind, id, err := youtubeURLIDExtractor(websiteURL) + if !errors.Is(err, expected.error) { + t.Fatalf(`Unexpected error: %v got %v`, expected.error, err) + } + if id != expected.ID { + t.Fatalf(`Unexpected ID: %v got %v`, expected.ID, id) + } + if kind != expected.Kind { + t.Fatalf(`Unexpected Kind: %v got %v`, expected.Kind, kind) + } + } +} + func TestFindYoutubeChannelFeed(t *testing.T) { scenarios := map[string]string{ "https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw": "https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw",