Add proper YouTube archiving via YT-DLP (#126)

* add: yt-dlp support to gather YouTube URLs from watch pages

* [site/yt] add: format selection & metadata record

* [ext/m3u8] initial commit

* fix: remove default global HTTP timeout

* [site/yt] wip: fix tests

* chores: small refactoring

* [site/yt] fix test

* ytdlp: remove useless subtitles parsing function

* m3u8: handle content-type case insensitively

* chore: small refactoring

* ytdlp: add dubbed audio streams

* ytdlp: format selection & refactoring

CorentinB authored Sep 12, 2024
1 parent 6d512bb commit cfa2980
Showing 21 changed files with 714 additions and 174 deletions.
6 changes: 5 additions & 1 deletion cmd/get.go
@@ -43,7 +43,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.")
getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
getCmd.PersistentFlags().Int("http-timeout", 30, "Number of seconds to wait before timing out a request.")
getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.")
getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, <link> HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
@@ -84,6 +84,10 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().String("es-password", "", "ElasticSearch password to use for indexing crawl logs.")
getCmd.PersistentFlags().String("es-index-prefix", "zeno", "ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`")

// Dependencies flags
getCmd.PersistentFlags().Bool("no-ytdlp", false, "Disable youtube-dlp usage for video extraction.")
getCmd.PersistentFlags().String("ytdlp-path", "", "Path to youtube-dlp binary.")

// Alias support
// As cobra doesn't support aliases natively (couldn't find a way to do it), we have to do it manually
// This is a workaround to allow users to use `--hops` instead of `--max-hops` for example
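
Note: the two new dependency flags map onto the `NoYTDLP` and `YTDLPPath` config fields added below. The binary lookup itself is not in the files shown here; as a rough sketch of how a crawler could resolve the yt-dlp binary from these settings (the `findYTDLP` helper name and its behaviour are assumptions for illustration, not code from this commit):

```go
package dependencies

import (
	"fmt"
	"os/exec"
)

// findYTDLP resolves the yt-dlp binary to execute. Illustrative sketch only:
// an explicit --ytdlp-path value wins, otherwise the binary is searched on $PATH.
func findYTDLP(configuredPath string) (string, error) {
	if configuredPath != "" {
		return configuredPath, nil
	}

	path, err := exec.LookPath("yt-dlp")
	if err != nil {
		return "", fmt.Errorf("yt-dlp not found in PATH: %w", err)
	}

	return path, nil
}
```
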
4 changes: 4 additions & 0 deletions config/config.go
@@ -76,6 +76,10 @@ type Config struct {
NoStdoutLogging bool `mapstructure:"no-stdout-log"`
NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"`
Handover bool `mapstructure:"handover"`

// Dependencies
NoYTDLP bool `mapstructure:"no-ytdlp"`
YTDLPPath string `mapstructure:"ytdlp-path"`
}

var (
1 change: 1 addition & 0 deletions go.mod
@@ -13,6 +13,7 @@ require (
github.com/google/uuid v1.6.0
github.com/gosuri/uilive v0.0.4
github.com/gosuri/uitable v0.0.4
github.com/grafov/m3u8 v0.12.0
github.com/paulbellamy/ratecounter v0.2.0
github.com/philippgille/gokv/leveldb v0.7.0
github.com/prometheus/client_golang v1.20.3
6 changes: 6 additions & 0 deletions go.sum
@@ -57,6 +57,12 @@ github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY=
github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI=
github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY=
github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo=
github.com/grafana/pyroscope-go v1.1.2 h1:7vCfdORYQMCxIzI3NlYAs3FcBP760+gWuYWOyiVyYx8=
github.com/grafana/pyroscope-go v1.1.2/go.mod h1:HSSmHo2KRn6FasBA4vK7BMiQqyQq8KSuBKvrhkXxYPU=
github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg=
github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU=
github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4=
github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
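
Note: the new `github.com/grafov/m3u8` dependency backs the `[ext/m3u8]` extractor mentioned in the commit messages (the extractor file itself is among the changed files not shown in this excerpt). A hedged sketch of how segment and variant URLs can be gathered with that library; the helper name and error handling are assumptions, not the actual `extractor.M3U8` implementation:

```go
package m3u8sketch

import (
	"io"

	"github.com/grafov/m3u8"
)

// collectM3U8URLs decodes an HLS playlist and returns the raw URIs it references.
// Sketch only: the real extractor also has to resolve relative URIs against the
// playlist URL before queueing them as assets.
func collectM3U8URLs(body io.Reader) ([]string, error) {
	playlist, listType, err := m3u8.DecodeFrom(body, true)
	if err != nil {
		return nil, err
	}

	var urls []string
	switch listType {
	case m3u8.MEDIA:
		media := playlist.(*m3u8.MediaPlaylist)
		for _, segment := range media.Segments {
			if segment != nil {
				urls = append(urls, segment.URI)
			}
		}
	case m3u8.MASTER:
		master := playlist.(*m3u8.MasterPlaylist)
		for _, variant := range master.Variants {
			if variant != nil {
				urls = append(urls, variant.URI)
			}
		}
	}

	return urls, nil
}
```
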
144 changes: 133 additions & 11 deletions internal/pkg/crawl/assets.go
@@ -1,20 +1,137 @@
package crawl

import (
"io"
"net/http"
"net/url"
"regexp"
"strings"
"sync/atomic"

"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/remeh/sizedwaitgroup"
)

var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)

func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error {
var resp *http.Response

// Prepare GET request
req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
if err != nil {
return err
}

req.Header.Set("Referer", utils.URLToString(item.ParentURL))
req.Header.Set("User-Agent", c.UserAgent)

// If headers are passed, apply them to the request
if headers != nil {
for key, value := range headers {
req.Header.Set(key, value)
}
}

// Apply cookies obtained from the original URL captured
for i := range cookies {
req.AddCookie(cookies[i])
}

resp, err = c.executeGET(item, req, false)
if err != nil && err.Error() == "URL from redirection has already been seen" {
return nil
} else if err != nil {
return err
}
defer resp.Body.Close()

if extractor.IsM3U8(resp) {
assets, err := extractor.M3U8(resp)
if err == nil {
c.captureAssets(item, assets, cookies, headers)
} else {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
}
}

io.Copy(io.Discard, resp.Body)

return nil
}

func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) {
// TODO: implement a counter for the number of assets
// currently being processed
// c.Frontier.QueueCount.Incr(int64(len(assets)))
swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets))
excluded := false

for _, asset := range assets {
// TODO: implement a counter for the number of assets
// currently being processed
// c.Frontier.QueueCount.Incr(-1)

// Just making sure we do not over archive by archiving the original URL
if utils.URLToString(item.URL) == utils.URLToString(asset) {
continue
}

// If the URL match any excluded string, we ignore it
for _, excludedString := range c.ExcludedStrings {
if strings.Contains(utils.URLToString(asset), excludedString) {
excluded = true
break
}
}

if excluded {
excluded = false
continue
}

swg.Add()
c.URIsPerSecond.Incr(1)

go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) {
defer swg.Done()

// Create the asset's item
newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false)
if err != nil {
c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{
"parentHop": item.Hop,
"parentUrl": utils.URLToString(item.URL),
"type": "asset",
})).Error("error while creating asset item")
return
}

// Capture the asset
err = c.captureAsset(newAsset, cookies, headers)
if err != nil {
c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{
"parentHop": item.Hop,
"parentUrl": utils.URLToString(item.URL),
"type": "asset",
})).Error("error while capturing asset")
return
}

// If we made it to this point, it means that the asset have been crawled successfully,
// then we can increment the locallyCrawled variable
atomic.AddUint64(&item.LocallyCrawled, 1)
}(asset, &swg)
}

swg.Wait()
}

func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
var rawAssets []string
var URL = utils.URLToString(item.URL)
@@ -198,7 +315,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
if err != nil {
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
-rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
+rawAssets = append(rawAssets, URLsFromJSON...)
}
}
}
@@ -274,21 +391,26 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
// Turn strings into url.URL
assets = append(assets, utils.StringSliceToURLSlice(rawAssets)...)

-// Ensure that excluded hosts aren't in the assets.
-assets = c.excludeHosts(assets)
-
-// Go over all assets and outlinks and make sure they are absolute links
-assets = utils.MakeAbsolute(base, assets)
+// Ensure that no asset that would be excluded is added to the list,
+// remove all fragments, and make sure that all assets are absolute URLs
+assets = c.cleanURLs(base, assets)

return utils.DedupeURLs(assets), nil
}

-func removeGoogleVideoURLs(input []string) (output []string) {
-for _, i := range input {
-if !strings.Contains(i, "googlevideo.com") {
-output = append(output, i)
+func (c *Crawl) cleanURLs(base *url.URL, URLs []*url.URL) (output []*url.URL) {
+// Remove excluded URLs
+for _, URL := range URLs {
+if !c.isExcluded(URL) {
+output = append(output, URL)
}
}

-return output
+// Make all URLs absolute
+if base != nil {
+output = utils.MakeAbsolute(base, output)
+}
+
+// Remove fragments
+return utils.RemoveFragments(output)
}
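
Note: `extractor.IsM3U8`, called in `captureAsset` above, is defined in one of the changed files not shown here. Going by the `m3u8: handle content-type case insensitively` commit message, the detection presumably normalises the Content-Type header before matching; a sketch of that idea under that assumption, not the actual implementation:

```go
package m3u8sketch

import (
	"net/http"
	"strings"
)

// looksLikeM3U8 reports whether a response advertises an HLS playlist.
// Sketch only: lowercasing the header keeps values such as
// "Application/vnd.Apple.MPEGURL" from being missed.
func looksLikeM3U8(resp *http.Response) bool {
	contentType := strings.ToLower(resp.Header.Get("Content-Type"))

	return strings.Contains(contentType, "application/vnd.apple.mpegurl") ||
		strings.Contains(contentType, "application/x-mpegurl") ||
		strings.Contains(contentType, "audio/mpegurl")
}
```
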
(Diffs for the remaining 16 changed files are not shown in this excerpt.)