Skip to content

Commit

Permalink
Merge pull request #3654 from rebeccacremona/only_save_wacz
Browse files Browse the repository at this point in the history
Save only WACZ, not WARC
  • Loading branch information
rebeccacremona authored Nov 15, 2024
2 parents 7230e0e + d4747b8 commit 39785bd
Show file tree
Hide file tree
Showing 11 changed files with 155 additions and 123 deletions.
39 changes: 20 additions & 19 deletions perma_web/api/tests/test_link_resource.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import dateutil.parser
from io import StringIO
from glob import glob

import os
import dateutil.parser
import re
from requests.exceptions import RequestException
from requests import request as orig_request

from django.conf import settings
from django.core.files.storage import storages
from django.urls import reverse
from django.http import StreamingHttpResponse
from django.test.utils import override_settings
from io import StringIO
import re
from requests.exceptions import RequestException
from requests import request as orig_request

from mock import patch
import pytest

Expand Down Expand Up @@ -93,7 +93,7 @@ def setUp(self):
'private_reason',
]

def assertRecordsInWarc(self, link, upload=False, expected_records=None, check_screenshot=False, check_provenance_summary=False):
def assertRecordsInArchive(self, link, upload=False, expected_records=None, check_screenshot=False, check_provenance_summary=False, filetype='wacz'):

def find_recording_in_warc(index, capture_url, content_type):
warc_content_type = "application/http; msgtype=response"
Expand Down Expand Up @@ -128,7 +128,8 @@ def find_attachment_in_warc(index, capture_url):
self.assertTrue(link.primary_capture.content_type, "Capture is missing a content type.")

# create an index of the warc
with storages[settings.WARC_STORAGE].open(link.warc_storage_file(), 'rb') as warc_file:
extract = filetype == 'wacz'
with link.get_warc(extract) as warc_file:
index = index_warc_file(warc_file)

# see if the index reports the content is in the warc
Expand Down Expand Up @@ -401,14 +402,14 @@ def test_should_create_archive_from_html_url(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInWarc(link, check_screenshot=True, check_provenance_summary=True)
self.assertRecordsInArchive(link, check_screenshot=True, check_provenance_summary=True)
self.assertTrue(link.primary_capture.content_type.startswith('text/html'))
self.assertFalse(link.is_private)
self.assertEqual(link.submitted_title, "Test title.")
self.assertEqual(link.submitted_description, "Test description.")
self.assertRegex(link.captured_by_software, r'scoop @ harvard library innovation lab: \d+\.\d+.\d+')
expected_size = 15340
self.assertLessEqual(abs(link.warc_size-expected_size), 100)
expected_size = 21954
self.assertLessEqual(abs(link.wacz_size-expected_size), 100)

# check folder
self.assertTrue(link.folders.filter(pk=target_folder.pk).exists())
Expand All @@ -425,7 +426,7 @@ def test_should_create_archive_from_pdf_url(self, allowed):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInWarc(link, check_provenance_summary=True)
self.assertRecordsInArchive(link, check_provenance_summary=True)
self.assertEqual(link.primary_capture.content_type, 'application/pdf')

# check folder
Expand Down Expand Up @@ -597,7 +598,7 @@ def test_media_capture_in_iframes(self):
("wide1.png", "image/png"), ("wide2.png", "image/png"), ("narrow.png", "image/png")
]
link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInWarc(link, expected_records=expected_records)
self.assertRecordsInArchive(link, expected_records=expected_records)


#########################
Expand All @@ -612,7 +613,7 @@ def test_should_create_archive_from_pdf_file(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInWarc(link, upload=True)
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_create_archive_from_jpg_file(self):
Expand All @@ -623,7 +624,7 @@ def test_should_create_archive_from_jpg_file(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInWarc(link, upload=True)
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_reject_jpg_file_with_invalid_url(self):
Expand All @@ -644,7 +645,7 @@ def test_should_should_create_archive_from_jpg_file_with_nonloading_url(self):

link = Link.objects.get(guid=obj['guid'])
self.assertEqual(link.submitted_url, 'http://asdf.asdf')
self.assertRecordsInWarc(link, upload=True)
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_reject_invalid_file(self):
Expand Down Expand Up @@ -725,7 +726,7 @@ def test_custom_title_not_overridden(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInWarc(link)
self.assertRecordsInArchive(link)
self.assertEqual(link.submitted_title, custom_title)
self.assertEqual(link.submitted_description, "Test description.")

Expand All @@ -737,7 +738,7 @@ def test_no_title_or_description_found(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInWarc(link)
self.assertRecordsInArchive(link)
self.assertEqual(link.submitted_title, link.get_default_title())
self.assertIsNone(link.submitted_description)

Expand Down
66 changes: 40 additions & 26 deletions perma_web/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import os
from random import choice
import subprocess
from waffle import get_waffle_flag_model

from django.conf import settings
from django.core.files.storage import storages
from django.core.management import call_command
from django.core.serializers.json import DjangoJSONEncoder
from django.urls import reverse
Expand Down Expand Up @@ -116,24 +116,23 @@ def cleanup_storage():
storage.objects.delete()


URL_MAP = {
'homepage': reverse('landing'),
'login': reverse('user_management_limited_login'),
'about': reverse('about'),
'contact': reverse('contact'),
'folders': reverse('create_link'),
'bookmarklet': reverse('service_bookmarklet_create'),
}


class URLs:
def __init__(self, base_url):
for name, url in URL_MAP.items():
setattr(self, name, base_url + url)
@pytest.fixture
def urls(transactional_db, live_server_ssl, complete_link_with_warc):
urls = {
'homepage': reverse('landing'),
'login': reverse('user_management_limited_login'),
'about': reverse('about'),
'contact': reverse('contact'),
'folders': reverse('create_link'),
'bookmarklet': reverse('service_bookmarklet_create'),
'perma_link_with_warc': reverse('single_permalink', args=[complete_link_with_warc.guid])
}

class URLs:
def __init__(self, base_url):
for name, url in urls.items():
setattr(self, name, base_url + url)

@pytest.fixture
def urls(transactional_db, live_server_ssl):
return URLs(f'https://{settings.HOST}')


Expand All @@ -148,15 +147,6 @@ def user() -> User:
return User("functional_test_user@example.com", "pass")


@pytest.fixture
def wacz_user() -> User:
"""For this user, the 'wacz-playback' flag is True"""
u = LinkUser.objects.get(email="wacz_functional_test_user@example.com")
flag, _created = get_waffle_flag_model().objects.get_or_create(name="wacz-playback")
flag.users.add(u.id)
return User(u.email, "pass")


@pytest.fixture
def log_in_user(urls):
"""A utility to log in the desired user"""
Expand Down Expand Up @@ -950,6 +940,30 @@ def complete_link(complete_link_factory):
return complete_link_factory()


@pytest.fixture
def complete_link_with_warc(complete_link_factory):
    """
    A link for http://example.com with a WARC file in storage.

    Builds the link via `complete_link_factory`, copies the
    `new_style_archive/archive.warc.gz` test asset into WARC storage at the
    link's WARC path (overwriting anything already there), records the
    asset's size on `link.warc_size`, and saves the link.

    Returns the saved link.
    """
    link = complete_link_factory({
        "submitted_url": "http://example.com",
        "submitted_title": "Example Domain"
    })

    warc_path = os.path.join(
        settings.PROJECT_ROOT, "perma/tests/assets/new_style_archive/archive.warc.gz"
    )
    with open(warc_path, "rb") as warc_file:
        storages[settings.WARC_STORAGE].store_file(
            warc_file, link.warc_storage_file(), overwrite=True
        )

    # Take the size from the filesystem rather than from warc_file.tell():
    # the read position after store_file() depends on how the storage helper
    # consumed (or rewound) the stream, which is not visible from here.
    link.warc_size = os.path.getsize(warc_path)
    link.save()

    return link



@pytest.fixture
def complete_link_without_capture_job(complete_link):
complete_link.capture_job.delete()
Expand Down
36 changes: 22 additions & 14 deletions perma_web/functional_tests/test_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

two_minutes = 120 * 1000


def create_link(page):
"""
A helper:
Expand All @@ -17,38 +18,42 @@ def create_link(page):
url_field.type("https://example.com/")
page.locator('#addlink').click()
page.wait_for_url(re.compile('/[A-Za-z0-9]{4}-[A-Za-z0-9]{4}$'), timeout=two_minutes)


def check_example_playback(page):
    """
    A helper:
    assert that `page` has the Perma title for "Example Domain" and that the
    <h1> inside the nested playback iframes contains "Example Domain".
    """
    expect(page).to_have_title('Perma | Example Domain')

    # The heading sits three frames deep: the archive iframe, then two
    # further nested iframes.
    archive_frame = page.frame_locator('.archive-iframe')
    inner_frame = archive_frame.frame_locator('iframe').frame_locator('iframe')
    heading = inner_frame.locator('h1')
    expect(heading).to_contain_text("Example Domain")

def test_create_link_warc_playback(page, user, log_in_user) -> None:

def test_create_link_wacz_playback(page, user, log_in_user) -> None:
"""
It should be possible to successfully create a link from a URL.
This user (no feature flag) should see a WARC playback.
This user should see a WACZ playback.
"""
log_in_user(page, user)
create_link(page)
check_example_playback(page)

# Verify we are seing a WARC playback, not a WACZ playback
assert ".warc.gz?" in page.content()
assert ".wacz?" not in page.content()
# Verify we are seeing a WACZ playback, not a WARC playback
assert ".warc.gz?" not in page.content()
assert ".wacz?" in page.content()


def test_create_link_wacz_playback(page, wacz_user, log_in_user) -> None:
def test_warc_playback(page, user, log_in_user, urls) -> None:
"""
It should be possible to successfully create a link from a URL.
This user (with feature flag set) should see a WACZ playback.
The WARC of a legacy Perma Link with no WACZ should play back.
"""
log_in_user(page, wacz_user)
create_link(page)
log_in_user(page, user)
page.goto(urls.perma_link_with_warc)
check_example_playback(page)

# Verify we are seing a WACZ playback, not a WARC playback
assert ".warc.gz?" not in page.content()
assert ".wacz?" in page.content()
# Verify we are seeing a WARC playback, not a WACZ playback
assert ".warc.gz?" in page.content()
assert ".wacz?" not in page.content()


def test_link_required(page, user, log_in_user) -> None:
Expand All @@ -58,6 +63,7 @@ def test_link_required(page, user, log_in_user) -> None:
page.locator('#addlink').click()
expect(page.locator("#error-container")).to_contain_text("URL cannot be empty")


def test_upload_nonexistent(page, user, log_in_user) -> None:
"""A modal should be displayed if the user input a domain we can't resolve"""
log_in_user(page, user)
Expand All @@ -68,6 +74,7 @@ def test_upload_nonexistent(page, user, log_in_user) -> None:
page.locator('#addlink').click()
expect(page.locator("#error-container")).to_contain_text("Couldn't resolve domain.")


def test_bookmarklet_redirect(page, user, log_in_user, urls) -> None:
"""Test that the URL parameter prepopulates the input field for the bookmarklet."""
log_in_user(page, user)
Expand All @@ -80,6 +87,7 @@ def test_bookmarklet_redirect(page, user, log_in_user, urls) -> None:
url_field = page.locator('#rawUrl')
expect(url_field).to_have_value(test_url)


def test_reminder_suppression(page, user, log_in_user):
"""Test that the reminder suppression cookie works."""
log_in_user(page, user)
Expand Down
30 changes: 2 additions & 28 deletions perma_web/perma/celery_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,32 +206,6 @@ def save_scoop_capture(link, capture_job, data):
content_type=supported_attachments[attachment_type]['content_type'],
).save()

#
# WARC
#
if 'warc' in capture_job.archive_formats:
# mode set to 'ab+' as a workaround for https://github.com/python/cpython/issues/69528
with tempfile.TemporaryFile('ab+') as tmp_file:
inc_progress(capture_job, 1, "Downloading web archive file (WARC)")
response, _ = send_to_scoop(
method="get",
path=f"artifact/{data['id_capture']}/archive.warc.gz",
valid_if=lambda code, _: code == 200,
stream=True
)
# Use the raw response, because Python requests standard methods gunzip the file
for chunk in response.raw.stream(10*1024, decode_content=False):
if chunk:
tmp_file.write(chunk)
tmp_file.flush()
link.warc_size = tmp_file.tell()
link.save(update_fields=['warc_size'])
tmp_file.seek(0)

inc_progress(capture_job, 1, "Saving web archive file (WARC)")
storages[settings.WARC_STORAGE].store_file(
tmp_file, link.warc_storage_file(), overwrite=True
)

#
# WACZ
Expand Down Expand Up @@ -703,8 +677,8 @@ def retry_upload(attempt_count, timeout_count):
# copy warc to local disk storage for upload.
# (potentially not necessary, but we think more robust against network conditions
# https://github.com/harvard-lil/perma/commit/25eb14ce634675ffe67d0f14f51308f1202b53ea)
with storages[settings.WARC_STORAGE].open(link.warc_storage_file()) as warc_file:
logger.info(f"Downloading {link.warc_storage_file()} from S3.")
with link.get_warc() as warc_file:
logger.info("Downloading archive from S3.")
copy_file_data(warc_file, temp_warc_file)
temp_warc_file.seek(0)

Expand Down
32 changes: 32 additions & 0 deletions perma_web/perma/migrations/0055_auto_20241114_1703.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Generated by Django 4.2.16 on 2024-11-14 17:03

from django.db import migrations

FLAG_NAME = "wacz-playback"


def delete_wacz_playback_feature_flag(apps, schema_editor):
    """Delete every waffle Flag row whose name is FLAG_NAME.

    `apps` is used to look up the waffle "Flag" model; `schema_editor` is
    accepted but not used.
    """
    flag_model = apps.get_model("waffle", "Flag")
    flag_model.objects.filter(name=FLAG_NAME).delete()

def create_wacz_playback_feature_flag(apps, schema_editor):
    """Save a new waffle Flag named FLAG_NAME with `testing=True`.

    `apps` is used to look up the waffle "Flag" model; `schema_editor` is
    accepted but not used.
    """
    flag_model = apps.get_model("waffle", "Flag")
    flag_model(name=FLAG_NAME, testing=True).save()

class Migration(migrations.Migration):
    """
    Data migration: deletes the FLAG_NAME waffle flag when applied and
    recreates it when reversed.
    """

    dependencies = [
        ('perma', '0054_capturejob_perma_captu_capture_daadd2_idx'),
        ('waffle', '0004_update_everyone_nullbooleanfield'),
    ]

    operations = [
        migrations.RunPython(
            code=delete_wacz_playback_feature_flag,
            reverse_code=create_wacz_playback_feature_flag,
        ),
    ]
Loading

0 comments on commit 39785bd

Please sign in to comment.