Refactorings and adding pydantic #18

Open · wants to merge 6 commits into base: master
88 changes: 65 additions & 23 deletions README.md
@@ -7,13 +7,18 @@ A Python parser for scientific PDF based on [GROBID](https://github.com/kermitt2
Use `pip` to install from this GitHub repository

```bash
pip install git+https://github.com/skuam/scipdf_parser
```

**Note**
* You also need the `en_core_web_sm` model for spaCy; download it with the command below
* You can change the GROBID version in `serve_grobid.sh` to test the parser on a new GROBID version

```bash
python -m spacy download en_core_web_sm
```


## Usage

Run GROBID using the given bash script before parsing PDFs
@@ -26,39 +31,76 @@ This script will download GROBID and run the service at default port 8070 (see m
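Before parsing, you can verify the service is actually up. A minimal sketch, assuming GROBID's standard `/api/isalive` health endpoint; the `grobid_is_alive` helper is illustrative, not part of scipdf:

```python
from urllib.request import urlopen
from urllib.error import URLError


def grobid_is_alive(base_url="http://localhost:8070", timeout=2.0):
    """Return True when the GROBID health endpoint answers with HTTP 200."""
    try:
        with urlopen(f"{base_url}/api/isalive", timeout=timeout) as resp:
            return resp.status == 200
    except (URLError, OSError):
        # service not running, wrong port, or connection timed out
        return False
```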
To parse a PDF provided in `example_data` folder or direct URL, use the following function:

```python
import json
from scipdf.parse_pdf import SciPDFParser
from scipdf.models import Article

parser = SciPDFParser()

article: Article = parser.parse_pdf('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf')

print(json.dumps(article.dict(), indent=4))
# output example
{
    "title": "A new method for measuring daytime sleepiness: the Epworth sleepiness scale.",
    "authors": "Murray Johns",
    "pub_date": "1991",
    "abstract": "Text of abstract",
    "sections": [
        {
            "heading": "Introduction",
            "text": "Text of introduction",
            "n_publication_ref": 1,
            "n_figure_ref": 1
        }
    ],
    "references": [
        {
            "title": "The Epworth Sleepiness Scale in Clinical Practice",
            "journal": "Sleep Breath",
            "year": "2017",
            "authors": "Chervin RD, et al."
        },
        {
            "title": "A new method for measuring daytime sleepiness: the Epworth sleepiness scale.",
            "journal": "Sleep",
            "year": "1991",
            "authors": "Johns MW"
        }
    ],
    "figures": [
        {
            "figure_label": "Figure 1",
            "figure_type": "table",
            "figure_id": "fig1",
            "figure_caption": "Caption of figure 1",
            "figure_data": "Data of figure 1"
        }
    ],
    "formulas": [
        {
            "formula_id": "f1",
            "formula_text": "a^2 + b^2 = c^2",
            "formula_coordinates": [1, 2, 3, 4]
        }
    ],
    "doi": "10.1111/j.1365-2869.1991.tb00031.x"
}
```
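Since the parsed `Article` is a pydantic model, its `.dict()` output can be post-processed like any dictionary. A minimal sketch; the `section_headings` helper is hypothetical, and the field names follow the example output above:

```python
def section_headings(article_dict):
    """Collect the heading of every parsed section from an article dict."""
    return [section["heading"] for section in article_dict.get("sections", [])]


# usage with a dict shaped like the parser's output
sample = {"sections": [{"heading": "Introduction", "text": "..."}]}
print(section_headings(sample))  # → ['Introduction']
```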

!!! Warning: Parsing of figures is not yet supported in the pydantic models, so you need to parse them manually. !!!

To parse figures from PDFs using [pdffigures2](https://github.com/allenai/pdffigures2), you can run

```python
scipdf.parse_figures('example_data', output_folder='figures') # folder should contain only PDF files
from scipdf.parse_pdf import SciPDFParser
parser = SciPDFParser()
parser.parse_figures('example_data', output_folder='figures') # folder should contain only PDF files
```
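After `parse_figures` finishes, you can enumerate what was extracted. A sketch under the assumption that pdffigures2 writes image and JSON metadata files into the output folder; the `list_figure_outputs` helper is hypothetical:

```python
from pathlib import Path


def list_figure_outputs(output_folder="figures", exts=(".png", ".json")):
    """List extracted figure files (images and metadata) in the output folder."""
    out = Path(output_folder)
    # recurse, since pdffigures2 may nest outputs per input PDF
    return sorted(p.name for p in out.rglob("*") if p.suffix in exts)
```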

You can see example output figures in `figures` folder.
9 changes: 9 additions & 0 deletions example.py
@@ -0,0 +1,9 @@
import json
from scipdf.parse_pdf import SciPDFParser
from scipdf.models import Article

if __name__ == '__main__':
parser = SciPDFParser()
article: Article = parser.parse_pdf('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf')

print(json.dumps(article.dict(), indent=4))
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,3 +3,4 @@ spacy
pandas
textstat
beautifulsoup4
pydantic
5 changes: 3 additions & 2 deletions scipdf/__init__.py
@@ -1,6 +1,7 @@
__version__ = "1.0.1"

__all__ = ["pdf", "features"]

from scipdf.features.text_utils import *
from scipdf.models import *
from scipdf.parse_pdf import *
8 changes: 1 addition & 7 deletions scipdf/features/__init__.py
@@ -1,7 +1 @@
from .text_utils import compute_readability_stats, compute_text_stats, compute_journal_features