
Commit 8579a8f: initial version
cbadenes committed Jun 23, 2023 (1 parent: 9e31e13)
Showing 11 changed files with 309 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
__pycache__/
*.py[cod]
*$py.class

.DS_Store
# C extensions
*.so

58 changes: 56 additions & 2 deletions README.md
@@ -1,2 +1,56 @@
# claimer
Breaks down a textual paragraph into verifiable claims.
[![Tests](https://github.com/librairy/claimer/actions/workflows/tests.yml/badge.svg)](https://github.com/librairy/claimer/actions/workflows/tests.yml)
[![Downloads](https://static.pepy.tech/badge/claimer)](https://pepy.tech/project/claimer)
[![Current Release Version](https://img.shields.io/github/release/librairy/claimer.svg?style=flat-square&logo=github)](https://github.com/librairy/claimer/releases)
[![pypi Version](https://img.shields.io/pypi/v/claimer.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/claimer/)
# librAIry Claimer

## Introduction

Claimer is a tool for breaking textual paragraphs down into verifiable claims. It combines coreference resolution (fastcoref) with entity linking (spaCy) to rewrite a paragraph as a set of self-contained statements, each of which can be fact-checked and validated on its own.

It is aimed at researchers, journalists, and fact-checkers who need to assess the accuracy of claims made in longer passages of text.

## Installation

To install the package, run:
```bash
pip install claimer
```

Afterwards, some resources must be downloaded. This can be done by running:

```bash
python -m spacy download en_core_web_md
python -m spacy_entity_linker "download_knowledge_base"
```

## Use

```python
from claimer import paragraph

# prepare a paragraph
text = "Face masks don’t work. Science in this area has evolved during the outbreak, the body of scientific evidence that has built up shows that the risk of transmission is made lower by wearing a face covering. The more we learn about COVID-19 the clearer it is that face coverings are an absolute vital tool in our fight against the virus. They effectively capture droplets, which is the main way the virus travels from person to person. According to the British Medical Association, if you don't wear it and have COVID-19, the risk of spreading it to others can be as high as 70%. If you do wear it, the risk drops to 5%. Make sure you wear it in all public indoor spaces and whenever you can't keep a 2m distance from others. Use a face covering is simple and easy way we can all stop the spread of the virus.",

# retrieve the claims
claims = paragraph.get_claims(text)

# iterate over the claims
for i, claim in enumerate(claims):
    print(f"{i}) {claim}")

# OUTPUT:
# 0) Face masks don’t work.
# 1) Science in this area has evolved during the outbreak, the body of scientific evidence that has built up shows that the risk of transmission is made lower by wearing a face covering.
# 2) The more we learn about COVID-19 the clearer it is that face coverings are an absolute vital tool in our fight against COVID-19 .
# 3) face coverings effectively capture droplets, which is the main way COVID-19 travels from person to person.
# 4) According to the British Medical Association, if you don't wear face coverings and have COVID-19 , the risk of spreading it to others can be as high as 70%.
# 5) If you do wear face coverings , the risk of spreading it to others drops to 5%.
# 6) Make sure you wear face coverings in all public indoor spaces and whenever you can't keep a 2m distance from others.
# 7) Use a face covering is simple and easy way we can all stop the spread of COVID-19

```

## Note

The Entity Linker is still experimental at this stage and should not be used in production.
Empty file added claimer/__init__.py
Empty file.
57 changes: 57 additions & 0 deletions claimer/paragraph.py
@@ -0,0 +1,57 @@
from claimer import parser

def get_claims(text):

    (clusters, cluster_names) = parser.get_corefs(text)
    #print(cluster_names)
    tokens = parser.get_tokens(text)

    # choose a label for each coreference cluster: prefer the mention whose
    # linked entity has the fewest sub-entities (lowest 'level')
    cluster_labels = {}
    for idx, cluster in enumerate(clusters):
        position = 0  # stays 0: the first mention string names the cluster
        candidates = []
        for (i, j) in cluster:
            level = 99999
            label = cluster_names[idx][position]
            for x in range(i, j):
                if x in tokens:
                    tokens[x]['cluster'] = idx
                    t = tokens[x]
                    if t['level'] < level:
                        level = t['level']
            candidates.append((level, label))
        candidates.sort(key=lambda x: x[0])
        cluster_labels[idx] = candidates[0][1]
    #print("Cluster Labels:", cluster_labels)

    # rebuild the text sentence by sentence, replacing coreference mentions
    # with their cluster label
    claims = []
    claim = []
    is_valid = False
    current_cluster = -1
    for k in tokens:
        t = tokens[k]
        if 'label' in t:
            # the current sentence mentions at least one linked entity
            is_valid = True
        if t['text'] == '.':
            if is_valid:
                claims.append("".join(claim) + ".")
            claim = []
            current_cluster = -1
            is_valid = False
            continue
        if 'cluster' in t:
            if current_cluster == t['cluster']:
                # later token of a mention that has already been replaced
                continue
            # pos: 'PRON', tag: 'PRP$' -> skip possessive pronouns
            elif (t['pos'] == 'PRON') and (t['tag'] == 'PRP$'):
                continue
            current_cluster = t['cluster']
            claim.append(cluster_labels[current_cluster] + " ")
        else:
            claim.append(t['text_with_ws'])
            current_cluster = -1
    if (len(claim) > 0) and is_valid:
        claims.append("".join(claim) + ".")
    #print("Num claims:", len(claims))
    return claims
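
For intuition, here is a minimal, self-contained sketch (hypothetical data, not part of this commit) of the substitution rule `get_claims` applies: tokens of a coreference mention are replaced, once, by the cluster's label, and possessive pronouns are skipped.

```python
# Hypothetical token stream for "They capture droplets", where "They"
# belongs to a cluster labelled "face coverings"
cluster_labels = {0: "face coverings"}
tokens = {
    0:  {'text': 'They', 'text_with_ws': 'They ', 'pos': 'PRON', 'tag': 'PRP', 'cluster': 0},
    5:  {'text': 'capture', 'text_with_ws': 'capture ', 'pos': 'VERB', 'tag': 'VBP'},
    13: {'text': 'droplets', 'text_with_ws': 'droplets', 'pos': 'NOUN', 'tag': 'NNS'},
}

claim = []
current_cluster = -1
for k in sorted(tokens):
    t = tokens[k]
    if 'cluster' in t:
        if current_cluster == t['cluster']:
            continue  # later token of the same mention
        if t['pos'] == 'PRON' and t['tag'] == 'PRP$':
            continue  # drop possessive pronouns
        current_cluster = t['cluster']
        claim.append(cluster_labels[current_cluster] + " ")
    else:
        claim.append(t['text_with_ws'])
        current_cluster = -1

print("".join(claim) + ".")  # -> "face coverings capture droplets."
```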
89 changes: 89 additions & 0 deletions claimer/parser.py
@@ -0,0 +1,89 @@
#from fastcoref import FCoref
from fastcoref import LingMessCoref
from fastcoref import spacy_component
import spacy
import itertools

#model = FCoref(device='cuda:0')
#model = FCoref()

#model = LingMessCoref(device='cuda:0')
model = LingMessCoref()

# initialize language model
nlp = spacy.load("en_core_web_md")
# add coref pipeline
#nlp.add_pipe("fastcoref")
# add pipeline (declared through entry_points in setup.py)
nlp.add_pipe("entityLinker", last=True)

def get_corefs(input_text):
    text = input_text.replace("\t", "").replace("\n", "")
    preds = model.predict(texts=[text])

    if len(preds) == 0:
        # no coreference output: return empty clusters so the caller's
        # tuple unpacking still works
        return ([], [])

    cluster_num = preds[0].get_clusters(as_strings=False)
    # print("Num Clusters:", len(cluster_num))
    # print("Initial Clusters:", cluster_num)

    # clean clusters: when spans overlap, keep the largest span per position
    cluster_spans = {}
    for cluster in cluster_num:
        for (i, j) in cluster:
            size = j - i
            for x in range(i, j):
                if x in cluster_spans:
                    if size > (cluster_spans[x][1] - cluster_spans[x][0]):
                        cluster_spans[x] = (i, j)
                else:
                    cluster_spans[x] = (i, j)

    # keep only the mentions that survived the overlap filter
    clusters = []
    for cluster in cluster_num:
        cluster_tokens = []
        for (i, j) in cluster:
            if (i in cluster_spans) and (cluster_spans[i] == (i, j)):
                cluster_tokens.append((i, j))
        if len(cluster_tokens) > 0:
            clusters.append(cluster_tokens)
    #print("Cluster Spans:", clusters)

    labels = preds[0].get_clusters()
    print("Cluster Tokens:", labels)

    return (clusters, labels)

def get_tokens(text):
    doc = nlp(text)

    # linked entities indexed by the token position where their span starts
    entities = {}
    for e in doc._.linkedEntities:
        span = e.get_span()
        entity = {
            'id': e.get_id(),
            'label': e.get_label(),
            'level': len(e.get_sub_entities())
        }
        entities[span.start] = entity
    #print("Entities:", entities)

    # tokens indexed by character offset, annotated with entity info
    tokens = {}
    for id, t in enumerate(doc):
        token = {
            'text': t.text,
            'text_with_ws': t.text_with_ws,
            'pos': t.pos_,
            'tag': t.tag_,
            'level': 99999
        }
        if id in entities:
            entity = entities[id]
            token['level'] = entity['level']
            token['label'] = entity['label']
        tokens[t.idx] = token
    #print("Tokens:", tokens)

    return tokens
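
For reference, a sketch of the shapes these two functions hand back to `paragraph.get_claims` (the example values below are illustrative, and assume the spaCy model and knowledge base have already been downloaded):

```python
from claimer import parser

# get_corefs: per coreference cluster, (start, end) character spans plus the
# same mentions as strings
clusters, labels = parser.get_corefs("Face masks work. They capture droplets.")
# clusters -> [[(0, 10), (17, 21)]]       (illustrative)
# labels   -> [['Face masks', 'They']]    (illustrative)

# get_tokens: a dict keyed by each token's character offset; tokens covered by
# a linked entity also carry 'label' and 'level'
tokens = parser.get_tokens("Face masks work.")
# tokens[0] -> {'text': 'Face', 'text_with_ws': 'Face ', 'pos': 'NOUN',
#               'tag': 'NN', 'level': 0, 'label': 'face mask'}  (illustrative)
```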
18 changes: 18 additions & 0 deletions makefile
@@ -0,0 +1,18 @@
init:
	pip install -r requirements.txt
	python -m spacy download en_core_web_md
	python -m spacy_entity_linker "download_knowledge_base"
	pip install --upgrade build
	pip install --upgrade twine

test:
	pytest -v -s tests

build:
	rm -rf dist/
	python -m build
	python -m twine upload --repository pypi dist/*

requirements:
	pipreqs --savepath=requirements.in && pip-compile
	rm requirements.in
35 changes: 35 additions & 0 deletions pyproject.toml
@@ -0,0 +1,35 @@
[build-system]
requires = [
    "setuptools==65.3.0",
    "fastcoref",
    "spacy",
    "spacy-entity-linker"
]
build-backend = "setuptools.build_meta"

[project]
name = "claimer"
version = "0.0.1"
authors = [
    { name="Carlos Badenes-Olmedo", email="carlos.badenes@upm.es" },
]
description = "Breaks down a textual paragraph into verifiable claims."
readme = "README.md"
requires-python = ">=3.7"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
    "Natural Language :: English",
    "Topic :: Software Development :: Libraries",
]
dependencies = [
    "fastcoref",
    "spacy",
    "spacy-entity-linker"
]

[project.urls]
"Homepage" = "https://github.com/librairy/claimer"
"Bug Tracker" = "https://github.com/librairy/claimer/issues"
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
fastcoref==2.1.6
spacy==3.5.3
spacy-entity-linker==1.0.3
33 changes: 33 additions & 0 deletions scripts/test_news.py
@@ -0,0 +1,33 @@
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from claimer import paragraph

if __name__ == "__main__":
    import json

    input_dir = "../../data/fake_news_corpus_spanish/news"
    output_dir = "../../data/fake_news_corpus_spanish/claims"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    file = open(f"{input_dir}/full_date.json")
    data = json.load(file)

    counter = 0
    for news in data:
        print(f"[{counter}] processing news", news['id'])
        counter += 1
        facts = paragraph.get_claims(news['claim'])
        news['facts'] = facts
        json_path = f"{output_dir}/{news['id']}.json"
        with open(json_path, "w") as outfile:
            json_out = json.dumps(news, ensure_ascii=False, indent=4)
            outfile.write(json_out)
        print(json_out)
        break  # stop after the first news item

    file.close()
Empty file added tests/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions tests/test_sample.py
@@ -0,0 +1,17 @@
from claimer import paragraph
import pytest

@pytest.fixture
def example_data():
    return {
        "id": "FK3",
        "text": "Face masks don’t work. Science in this area has evolved during the outbreak, the body of scientific evidence that has built up shows that the risk of transmission is made lower by wearing a face covering. The more we learn about COVID-19 the clearer it is that face coverings are an absolute vital tool in our fight against the virus. They effectively capture droplets, which is the main way the virus travels from person to person. According to the British Medical Association, if you don't wear it and have COVID-19, the risk of spreading it to others can be as high as 70%. If you do wear it, the risk drops to 5%. Make sure you wear it in all public indoor spaces and whenever you can't keep a 2m distance from others. Use a face covering is simple and easy way we can all stop the spread of the virus.",
        "source": "www.rebeccaharris.org"
    }

def test_text1(example_data):
    claims = paragraph.get_claims(example_data['text'])
    print("Sample:", example_data)
    for i, claim in enumerate(claims):
        print(f"{i}) {claim}")
    assert 8 == len(claims)
