-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix issue #12000: Parsing arXiv Id when importing a PDF with arXiv Id #12079
base: main
Are you sure you want to change the base?
Changes from 3 commits
19dcb44
36107f6
5d04b96
31e2619
d32ee7a
353f44e
478d8be
19e62f6
1565cc2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.LinkedFile; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.entry.identifier.ArXivIdentifier; | ||
import org.jabref.model.entry.identifier.DOI; | ||
import org.jabref.model.entry.types.EntryType; | ||
import org.jabref.model.entry.types.StandardEntryType; | ||
|
@@ -244,6 +245,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS | |
String title; | ||
String conference = null; | ||
String doi = null; | ||
String arxivId = null; | ||
String series = null; | ||
String volume = null; | ||
String number = null; | ||
|
@@ -256,6 +258,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS | |
// special case: possibly conference as first line on the page | ||
extractYear(); | ||
doi = getDoi(null); | ||
arxivId = getArxivId(null); | ||
if (curString.contains("Conference")) { | ||
fillCurStringWithNonEmptyLines(); | ||
conference = curString; | ||
|
@@ -388,6 +391,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS | |
} | ||
} else { | ||
doi = getDoi(doi); | ||
arxivId = getArxivId(arxivId); | ||
|
||
if ((publisher == null) && curString.contains("IEEE")) { | ||
// IEEE has the conference things at the end | ||
|
@@ -445,6 +449,9 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS | |
if (doi != null) { | ||
entry.setField(StandardField.DOI, doi); | ||
} | ||
if (arxivId != null) { | ||
entry.setField(StandardField.ARXIVID, arxivId); | ||
} | ||
if (series != null) { | ||
entry.setField(StandardField.SERIES, series); | ||
} | ||
|
@@ -458,6 +465,9 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS | |
entry.setField(StandardField.PAGES, pages); | ||
} | ||
if (year != null) { | ||
if (arxivId != null) { | ||
year = "20" + arxivId.substring(0, 2); | ||
} | ||
Comment on lines
+468
to
+470
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should go to the |
||
entry.setField(StandardField.YEAR, year); | ||
} | ||
if (publisher != null) { | ||
|
@@ -480,6 +490,21 @@ private String getDoi(String doi) { | |
return doi; | ||
} | ||
|
||
/**
 * Tries to pick up an arXiv identifier from the line currently being processed ({@code curString}),
 * unless an identifier has already been found.
 * <p>
 * The search only runs when {@code arxivId} is {@code null}. The text from the first "arxiv"/"arXiv"
 * marker to the end of {@code curString} is passed to {@link ArXivIdentifier#parse(String)}.
 *
 * @param arxivId the identifier found so far, or {@code null} if none has been found yet
 * @return the given {@code arxivId} if it was non-null; otherwise the normalized identifier parsed from
 *         {@code curString}, or {@code null} if no marker is present or parsing yields nothing
 */
private String getArxivId(String arxivId) { | ||
int pos; | ||
if (arxivId == null) { | ||
// indexOf is case-sensitive: only the spellings "arxiv" and "arXiv" are recognized here
pos = curString.indexOf("arxiv"); | ||
if (pos < 0) { | ||
pos = curString.indexOf("arXiv"); | ||
} | ||
if (pos >= 0) { | ||
// NOTE(review): the whole remainder of the line is handed to parse(); presumably any trailing
// text after the identifier makes parsing fail — confirm against ArXivIdentifier.parse
String arxivText = curString.substring(pos); | ||
return ArXivIdentifier.parse(arxivText).map(ArXivIdentifier::getNormalized).orElse(null); | ||
} | ||
} | ||
// Either an identifier was already known, or no marker was found (in which case this is null)
return arxivId; | ||
} | ||
|
||
private String getFirstPageContents(PDDocument document) throws IOException { | ||
PDFTextStripper stripper = new PDFTextStripper(); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ public enum StandardField implements Field { | |
ARCHIVEPREFIX("archiveprefix"), | ||
ASSIGNEE("assignee", FieldProperty.PERSON_NAMES), | ||
AUTHOR("author", FieldProperty.PERSON_NAMES), | ||
ARXIVID("arXivId", FieldProperty.VERBATIM), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. arXiv is not a StandardField; you need to use the eprint field for this. |
||
BOOKAUTHOR("bookauthor", FieldProperty.PERSON_NAMES), | ||
BOOKPAGINATION("bookpagination", FieldProperty.PAGINATION), | ||
BOOKSUBTITLE("booksubtitle", FieldProperty.BOOK_NAME), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,13 +45,13 @@ public static Optional<ArXivIdentifier> parse(String value) { | |
return getArXivIdentifier(identifierMatcher); | ||
} | ||
|
||
Pattern oldIdentifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?<id>(?<classification>[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})(v(?<version>\\d+))?"); | ||
Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(identifier); | ||
if (oldIdentifierMatcher.matches()) { | ||
return getArXivIdentifier(oldIdentifierMatcher); | ||
} | ||
Pattern oldIdentifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?<id>(?<classification>[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})(v(?<version>\\d+))?"); | ||
Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(identifier); | ||
if (oldIdentifierMatcher.matches()) { | ||
return getArXivIdentifier(oldIdentifierMatcher); | ||
} | ||
|
||
return Optional.empty(); | ||
return Optional.empty(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wrong indent. Please revert. |
||
} | ||
|
||
private static Optional<ArXivIdentifier> getArXivIdentifier(Matcher matcher) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -123,4 +123,38 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 | |
|
||
assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n")); | ||
} | ||
|
||
@Test | ||
void extractArxivIdFromPage1() { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please also use a real arXiv PDF. I think there is a link to one in the issue? |
||
BibEntry entry = new BibEntry(StandardEntryType.InProceedings) | ||
.withField(StandardField.DOI, "10.1017/S0007114507795296") | ||
.withField(StandardField.AUTHOR, "Review Article") | ||
.withField(StandardField.TITLE, "British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 arXiv:2408.06224v1 q The Authors") | ||
.withField(StandardField.YEAR, "2024") | ||
.withField(StandardField.ARXIVID, "2408.06224v1"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Needs to be eprint. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for your feedback! I see, I will change it. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You should then also set the corresponding eprinttype field to arxiv. |
||
|
||
String firstPageContent = """ | ||
British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296 arXiv:2408.06224v1 | ||
q The Authors 2024 | ||
|
||
Review Article | ||
|
||
Cocoa and health: a decade of research | ||
|
||
Karen A. Cooper1, Jennifer L. Donovan2, Andrew L. Waterhouse3 and Gary Williamson1* | ||
1Nestlé Research Center, Vers-Chez-les-Blanc, PO Box 44, CH-1000 Lausanne 26, Switzerland | ||
2Department of Psychiatry and Behavioural Sciences, Medical University of South Carolina, Charleston, SC 29425, USA | ||
3Department of Viticulture & Enology, University of California, Davis, CA 95616, USA | ||
|
||
(Received 5 December 2006 – Revised 29 May 2007 – Accepted 31 May 2007) | ||
|
||
Abbreviations: FMD, flow-mediated dilation; NO, nitirc oxide. | ||
|
||
*Corresponding author: Dr Gary Williamson, fax þ41 21 785 8544, email gary.williamson@rdls.nestle.com | ||
|
||
British Journal of Nutrition | ||
https://doi.org/10.1017/S0007114507795296 Published online by Cambridge University Press"""; | ||
|
||
assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n")); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These comments are user facing. A user does not see
PdfContentImporter
in the UI. We are updating the docs at JabRef/user-documentation#537 - and the updated section should be linked.