Skip to content

Commit

Permalink
add additional validation for lit url uploads (#887)
Browse files Browse the repository at this point in the history
Co-authored-by: Danny Peterson <113325602+dannypeterson@users.noreply.github.com>
  • Loading branch information
shapiromatron and dannypeterson authored Sep 18, 2023
1 parent 8943a38 commit e970dad
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 5 deletions.
12 changes: 7 additions & 5 deletions hawc/apps/lit/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,10 +584,8 @@ def clean_excel_file(self):
fn = self.cleaned_data["excel_file"]

# check extension
if fn.name[-5:] not in [".xlsx", ".xlsm"] and fn.name[-4:] not in [".xls"]:
raise forms.ValidationError(
"Must be an Excel file with an " "xlsx, xlsm, or xls file extension."
)
if fn.name[-5:] != ".xlsx":
raise forms.ValidationError("Must be an Excel file with an xlsx extension.")

# check parsing
try:
Expand All @@ -600,8 +598,12 @@ def clean_excel_file(self):
if df.columns.tolist() != ["HAWC ID", "Full text URL"]:
raise forms.ValidationError(self.EXCEL_FORMAT_ERROR)

try:
hawc_ids = df["HAWC ID"].astype(int).tolist()
except pd.errors.IntCastingNaNError:
raise forms.ValidationError("HAWC IDs must be integers.")

# check valid HAWC IDs
hawc_ids = df["HAWC ID"].tolist()
qs = models.Reference.objects.assessment_qs(self.assessment.id).filter(id__in=hawc_ids)
if unmatched := (set(hawc_ids) - set(qs.values_list("id", flat=True))):
raise forms.ValidationError(f"Invalid HAWC IDs: {list(unmatched)}")
Expand Down
69 changes: 69 additions & 0 deletions tests/hawc/apps/lit/test_forms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from pathlib import Path

import pandas as pd
import pytest
from django.core.files.uploadedfile import SimpleUploadedFile
from django.forms.models import model_to_dict
Expand All @@ -11,13 +12,16 @@
BulkReferenceStudyExtractForm,
ImportForm,
LiteratureAssessmentForm,
ReferenceExcelUploadForm,
ReferenceForm,
RisImportForm,
)
from hawc.apps.lit.models import Reference
from hawc.apps.study.models import Study
from hawc.services.utils.ris import ReferenceParser

from ..test_utils import df_to_form_data


@pytest.mark.django_db
class TestLiteratureAssessmentForm:
Expand Down Expand Up @@ -441,3 +445,68 @@ def test_validation_failures(self, db_keys):
assert form.errors == {
"study_type": ["Select a valid choice. crazy is not one of the available choices."]
}


@pytest.mark.django_db
class TestReferenceExcelUploadForm:
    """Exercise ``ReferenceExcelUploadForm`` handling of uploaded Excel files."""

    @staticmethod
    def _bind(assessment, files):
        # Every case uses the same empty instance/data; only the upload varies.
        return ReferenceExcelUploadForm(
            instance={},
            assessment=assessment,
            data={},
            files=files,
        )

    def test_success(self):
        """A two-column sheet with an integer HAWC ID and a URL validates."""
        frame = pd.DataFrame(
            data={
                "HAWC ID": [1],
                "Full text URL": ["https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5448372/"],
            }
        )
        form = self._bind(
            Assessment.objects.get(id=1),
            df_to_form_data("excel_file", frame),
        )
        assert form.is_valid() is True

    def test_validation(self):
        """Each malformed upload is rejected with the expected ``excel_file`` error."""
        assessment = Assessment.objects.get(id=1)

        def assert_rejected(files, expected):
            # The expected text must appear in the first error reported for the field.
            form = self._bind(assessment, files)
            assert form.is_valid() is False
            assert expected in form.errors["excel_file"][0]

        # Incorrect file extension
        assert_rejected(
            {"excel_file": SimpleUploadedFile("z", b"test")},
            "Must be an Excel file with an xlsx extension.",
        )

        # Incorrect file format
        assert_rejected(
            {"excel_file": SimpleUploadedFile("test.xlsx", b"test")},
            "Invalid Excel format.",
        )

        # Incorrect data in Excel
        bad_sheets = [
            # incorrect column names
            ({"test": [1, 2, 3]}, "Invalid Excel format."),
            # non-integer HAWC IDs
            (
                {"HAWC ID": [""], "Full text URL": ["https://www.ncbi.nlm.nih.gov/"]},
                "HAWC IDs must be integers.",
            ),
            # non-URL full text URL
            ({"HAWC ID": [1], "Full text URL": [None]}, "Invalid URLs"),
            ({"HAWC ID": [1], "Full text URL": [""]}, "Invalid URLs"),
            ({"HAWC ID": [1], "Full text URL": ["test"]}, "Invalid URLs"),
        ]
        for sheet, message in bad_sheets:
            assert_rejected(df_to_form_data("excel_file", pd.DataFrame(sheet)), message)
9 changes: 9 additions & 0 deletions tests/hawc/apps/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
"""

import json
from io import BytesIO
from pathlib import Path

import pandas as pd
from django.core.files.uploadedfile import SimpleUploadedFile
from django.http import HttpResponse
from django.test.client import Client
from rest_framework.response import Response
Expand Down Expand Up @@ -88,3 +91,9 @@ def check_200(
response = client.get(url, **kw)
assert response.status_code == 200
return response


def df_to_form_data(key: str, df: pd.DataFrame) -> dict:
    """Write ``df`` to an in-memory Excel file and wrap it as an upload.

    Returns a single-item dict mapping ``key`` to a ``SimpleUploadedFile``
    named ``test.xlsx``; the dataframe index is not written to the sheet.
    """
    buffer = BytesIO()
    df.to_excel(buffer, index=False)
    upload = SimpleUploadedFile("test.xlsx", buffer.getvalue())
    return {key: upload}

0 comments on commit e970dad

Please sign in to comment.