Add proper YouTube archiving via YT-DLP (#126)

* add: yt-dlp support to gather YouTube URLs from watch pages

* [site/yt] add: format selection & metadata record

* [ext/m3u8] initial commit

* fix: remove default global HTTP timeout

* [site/yt] wip: fix tests

* chores: small refactoring

* [site/yt] fix test

* ytdlp: remove useless subtitles parsing function

* m3u8: handle content-type case insensitively

* chore: small refactoring

* ytdlp: add dubbed audio streams

* ytdlp: format selection & refactoring

CorentinB authored Sep 12, 2024
1 parent 6d512bb commit cfa2980
Showing 21 changed files with 714 additions and 174 deletions.
6 changes: 5 additions & 1 deletion cmd/get.go
@@ -43,7 +43,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.")
getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
getCmd.PersistentFlags().Int("http-timeout", 30, "Number of seconds to wait before timing out a request.")
getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.")
getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, <link> HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
@@ -84,6 +84,10 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().String("es-password", "", "ElasticSearch password to use for indexing crawl logs.")
getCmd.PersistentFlags().String("es-index-prefix", "zeno", "ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`")

// Dependencies flags
getCmd.PersistentFlags().Bool("no-ytdlp", false, "Disable youtube-dlp usage for video extraction.")
getCmd.PersistentFlags().String("ytdlp-path", "", "Path to youtube-dlp binary.")

// Alias support
// As cobra doesn't support aliases natively (couldn't find a way to do it), we have to do it manually
// This is a workaround to allow users to use `--hops` instead of `--max-hops` for example
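
Note: the two new dependency flags map onto the `NoYTDLP` and `YTDLPPath` config fields added below. The binary lookup itself is not in the files shown here; as a rough sketch of how a crawler could resolve the yt-dlp binary from these settings (the `findYTDLP` helper name and its behaviour are assumptions for illustration, not code from this commit):

```go
package dependencies

import (
	"fmt"
	"os/exec"
)

// findYTDLP resolves the yt-dlp binary to execute. Illustrative sketch only:
// an explicit --ytdlp-path value wins, otherwise the binary is searched on $PATH.
func findYTDLP(configuredPath string) (string, error) {
	if configuredPath != "" {
		return configuredPath, nil
	}

	path, err := exec.LookPath("yt-dlp")
	if err != nil {
		return "", fmt.Errorf("yt-dlp not found in PATH: %w", err)
	}

	return path, nil
}
```
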
4 changes: 4 additions & 0 deletions config/config.go
@@ -76,6 +76,10 @@ type Config struct {
NoStdoutLogging bool `mapstructure:"no-stdout-log"`
NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"`
Handover bool `mapstructure:"handover"`

// Dependencies
NoYTDLP bool `mapstructure:"no-ytdlp"`
YTDLPPath string `mapstructure:"ytdlp-path"`
}

var (
1 change: 1 addition & 0 deletions go.mod
@@ -13,6 +13,7 @@ require (
github.com/google/uuid v1.6.0
github.com/gosuri/uilive v0.0.4
github.com/gosuri/uitable v0.0.4
github.com/grafov/m3u8 v0.12.0
github.com/paulbellamy/ratecounter v0.2.0
github.com/philippgille/gokv/leveldb v0.7.0
github.com/prometheus/client_golang v1.20.3
6 changes: 6 additions & 0 deletions go.sum
@@ -57,6 +57,12 @@ github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY=
github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI=
github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY=
github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo=
github.com/grafana/pyroscope-go v1.1.2 h1:7vCfdORYQMCxIzI3NlYAs3FcBP760+gWuYWOyiVyYx8=
github.com/grafana/pyroscope-go v1.1.2/go.mod h1:HSSmHo2KRn6FasBA4vK7BMiQqyQq8KSuBKvrhkXxYPU=
github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg=
github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU=
github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4=
github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
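
Note: the new `github.com/grafov/m3u8` dependency backs the `[ext/m3u8]` extractor mentioned in the commit messages (the extractor file itself is among the changed files not shown in this excerpt). A hedged sketch of how segment and variant URLs can be gathered with that library; the helper name and error handling are assumptions, not the actual `extractor.M3U8` implementation:

```go
package m3u8sketch

import (
	"io"

	"github.com/grafov/m3u8"
)

// collectM3U8URLs decodes an HLS playlist and returns the raw URIs it references.
// Sketch only: the real extractor also has to resolve relative URIs against the
// playlist URL before queueing them as assets.
func collectM3U8URLs(body io.Reader) ([]string, error) {
	playlist, listType, err := m3u8.DecodeFrom(body, true)
	if err != nil {
		return nil, err
	}

	var urls []string
	switch listType {
	case m3u8.MEDIA:
		media := playlist.(*m3u8.MediaPlaylist)
		for _, segment := range media.Segments {
			if segment != nil {
				urls = append(urls, segment.URI)
			}
		}
	case m3u8.MASTER:
		master := playlist.(*m3u8.MasterPlaylist)
		for _, variant := range master.Variants {
			if variant != nil {
				urls = append(urls, variant.URI)
			}
		}
	}

	return urls, nil
}
```
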
144 changes: 133 additions & 11 deletions internal/pkg/crawl/assets.go
@@ -1,20 +1,137 @@
package crawl

import (
"io"
"net/http"
"net/url"
"regexp"
"strings"
"sync/atomic"

"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/remeh/sizedwaitgroup"
)

var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)

func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error {
var resp *http.Response

// Prepare GET request
req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
if err != nil {
return err
}

req.Header.Set("Referer", utils.URLToString(item.ParentURL))
req.Header.Set("User-Agent", c.UserAgent)

// If headers are passed, apply them to the request
if headers != nil {
for key, value := range headers {
req.Header.Set(key, value)
}
}

// Apply cookies obtained from the original URL captured
for i := range cookies {
req.AddCookie(cookies[i])
}

resp, err = c.executeGET(item, req, false)
if err != nil && err.Error() == "URL from redirection has already been seen" {
return nil
} else if err != nil {
return err
}
defer resp.Body.Close()

if extractor.IsM3U8(resp) {
assets, err := extractor.M3U8(resp)
if err == nil {
c.captureAssets(item, assets, cookies, headers)
} else {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
}
}

io.Copy(io.Discard, resp.Body)

return nil
}

func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) {
// TODO: implement a counter for the number of assets
// currently being processed
// c.Frontier.QueueCount.Incr(int64(len(assets)))
swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets))
excluded := false

for _, asset := range assets {
// TODO: implement a counter for the number of assets
// currently being processed
// c.Frontier.QueueCount.Incr(-1)

// Just making sure we do not over archive by archiving the original URL
if utils.URLToString(item.URL) == utils.URLToString(asset) {
continue
}

// If the URL match any excluded string, we ignore it
for _, excludedString := range c.ExcludedStrings {
if strings.Contains(utils.URLToString(asset), excludedString) {
excluded = true
break
}
}

if excluded {
excluded = false
continue
}

swg.Add()
c.URIsPerSecond.Incr(1)

go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) {
defer swg.Done()

// Create the asset's item
newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false)
if err != nil {
c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{
"parentHop": item.Hop,
"parentUrl": utils.URLToString(item.URL),
"type": "asset",
})).Error("error while creating asset item")
return
}

// Capture the asset
err = c.captureAsset(newAsset, cookies, headers)
if err != nil {
c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{
"parentHop": item.Hop,
"parentUrl": utils.URLToString(item.URL),
"type": "asset",
})).Error("error while capturing asset")
return
}

// If we made it to this point, it means that the asset have been crawled successfully,
// then we can increment the locallyCrawled variable
atomic.AddUint64(&item.LocallyCrawled, 1)
}(asset, &swg)
}

swg.Wait()
}

func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
var rawAssets []string
var URL = utils.URLToString(item.URL)
@@ -198,7 +315,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
if err != nil {
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
-rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
+rawAssets = append(rawAssets, URLsFromJSON...)
}
}
}
@@ -274,21 +391,26 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
// Turn strings into url.URL
assets = append(assets, utils.StringSliceToURLSlice(rawAssets)...)

-// Ensure that excluded hosts aren't in the assets.
-assets = c.excludeHosts(assets)
-
-// Go over all assets and outlinks and make sure they are absolute links
-assets = utils.MakeAbsolute(base, assets)
+// Ensure that no asset that would be excluded is added to the list,
+// remove all fragments, and make sure that all assets are absolute URLs
+assets = c.cleanURLs(base, assets)

return utils.DedupeURLs(assets), nil
}

-func removeGoogleVideoURLs(input []string) (output []string) {
-for _, i := range input {
-if !strings.Contains(i, "googlevideo.com") {
-output = append(output, i)
+func (c *Crawl) cleanURLs(base *url.URL, URLs []*url.URL) (output []*url.URL) {
+// Remove excluded URLs
+for _, URL := range URLs {
+if !c.isExcluded(URL) {
+output = append(output, URL)
}
}

-return output
+// Make all URLs absolute
+if base != nil {
+output = utils.MakeAbsolute(base, output)
+}
+
+// Remove fragments
+return utils.RemoveFragments(output)
}
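
Note: `extractor.IsM3U8`, called in `captureAsset` above, is defined in one of the changed files not shown here. Going by the `m3u8: handle content-type case insensitively` commit message, the detection presumably normalises the Content-Type header before matching; a sketch of that idea under that assumption, not the actual implementation:

```go
package m3u8sketch

import (
	"net/http"
	"strings"
)

// looksLikeM3U8 reports whether a response advertises an HLS playlist.
// Sketch only: lowercasing the header keeps values such as
// "Application/vnd.Apple.MPEGURL" from being missed.
func looksLikeM3U8(resp *http.Response) bool {
	contentType := strings.ToLower(resp.Header.Get("Content-Type"))

	return strings.Contains(contentType, "application/vnd.apple.mpegurl") ||
		strings.Contains(contentType, "application/x-mpegurl") ||
		strings.Contains(contentType, "audio/mpegurl")
}
```
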
(Diffs for the remaining 16 changed files are not shown in this excerpt.)