
Commit 8579a8f: initial version
cbadenes committed Jun 23, 2023 (1 parent: 9e31e13)
Showing 11 changed files with 309 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
__pycache__/
*.py[cod]
*$py.class

.DS_Store
# C extensions
*.so

58 changes: 56 additions & 2 deletions README.md
@@ -1,2 +1,56 @@
# claimer
Breaks down a textual paragraph into verifiable claims.
[![Tests](https://github.com/librairy/claimer/actions/workflows/tests.yml/badge.svg)](https://github.com/librairy/claimer/actions/workflows/tests.yml)
[![Downloads](https://static.pepy.tech/badge/claimer)](https://pepy.tech/project/claimer)
[![Current Release Version](https://img.shields.io/github/release/librairy/claimer.svg?style=flat-square&logo=github)](https://github.com/librairy/claimer/releases)
[![pypi Version](https://img.shields.io/pypi/v/claimer.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/claimer/)
# librAIry Claimer

## Introduction

Claimer is a tool for breaking textual paragraphs down into verifiable claims. It combines coreference resolution (fastcoref) with entity linking (spaCy) to rewrite a paragraph as a set of self-contained statements, each of which can be fact-checked and validated on its own.

It is aimed at researchers, journalists, and fact-checkers who need to assess the accuracy of claims made in longer passages of text.

## Installation

To install the package, run:
```bash
pip install claimer
```

Afterwards, some resources must be downloaded. This can be done by running:

```bash
python -m spacy download en_core_web_md
python -m spacy_entity_linker "download_knowledge_base"
```

## Use

```python
from claimer import paragraph

# prepare a paragraph
text = "Face masks don’t work. Science in this area has evolved during the outbreak, the body of scientific evidence that has built up shows that the risk of transmission is made lower by wearing a face covering. The more we learn about COVID-19 the clearer it is that face coverings are an absolute vital tool in our fight against the virus. They effectively capture droplets, which is the main way the virus travels from person to person. According to the British Medical Association, if you don't wear it and have COVID-19, the risk of spreading it to others can be as high as 70%. If you do wear it, the risk drops to 5%. Make sure you wear it in all public indoor spaces and whenever you can't keep a 2m distance from others. Use a face covering is simple and easy way we can all stop the spread of the virus.",

# retrieve the claims
claims = paragraph.get_claims(text)

# iterate over the claims
for i, claim in enumerate(claims):
    print(f"{i}) {claim}")

# OUTPUT:
# 0) Face masks don’t work.
# 1) Science in this area has evolved during the outbreak, the body of scientific evidence that has built up shows that the risk of transmission is made lower by wearing a face covering.
# 2) The more we learn about COVID-19 the clearer it is that face coverings are an absolute vital tool in our fight against COVID-19 .
# 3) face coverings effectively capture droplets, which is the main way COVID-19 travels from person to person.
# 4) According to the British Medical Association, if you don't wear face coverings and have COVID-19 , the risk of spreading it to others can be as high as 70%.
# 5) If you do wear face coverings , the risk of spreading it to others drops to 5%.
# 6) Make sure you wear face coverings in all public indoor spaces and whenever you can't keep a 2m distance from others.
# 7) Use a face covering is simple and easy way we can all stop the spread of COVID-19

```

## Note

The Entity Linker is still experimental at this stage and should not be used in production.
Empty file added claimer/__init__.py
Empty file.
57 changes: 57 additions & 0 deletions claimer/paragraph.py
@@ -0,0 +1,57 @@
from claimer import parser

def get_claims(text):

    (clusters, cluster_names) = parser.get_corefs(text)
    #print(cluster_names)
    tokens = parser.get_tokens(text)

    # choose a label for each coreference cluster: prefer the mention whose
    # linked entity has the fewest sub-entities (lowest 'level')
    cluster_labels = {}
    for idx, cluster in enumerate(clusters):
        position = 0  # stays 0: the first mention string names the cluster
        candidates = []
        for (i, j) in cluster:
            level = 99999
            label = cluster_names[idx][position]
            for x in range(i, j):
                if x in tokens:
                    tokens[x]['cluster'] = idx
                    t = tokens[x]
                    if t['level'] < level:
                        level = t['level']
            candidates.append((level, label))
        candidates.sort(key=lambda x: x[0])
        cluster_labels[idx] = candidates[0][1]
    #print("Cluster Labels:", cluster_labels)

    # rebuild the text sentence by sentence, replacing coreference mentions
    # with their cluster label
    claims = []
    claim = []
    is_valid = False
    current_cluster = -1
    for k in tokens:
        t = tokens[k]
        if 'label' in t:
            # the current sentence mentions at least one linked entity
            is_valid = True
        if t['text'] == '.':
            if is_valid:
                claims.append("".join(claim) + ".")
            claim = []
            current_cluster = -1
            is_valid = False
            continue
        if 'cluster' in t:
            if current_cluster == t['cluster']:
                # later token of a mention that has already been replaced
                continue
            # pos: 'PRON', tag: 'PRP$' -> skip possessive pronouns
            elif (t['pos'] == 'PRON') and (t['tag'] == 'PRP$'):
                continue
            current_cluster = t['cluster']
            claim.append(cluster_labels[current_cluster] + " ")
        else:
            claim.append(t['text_with_ws'])
            current_cluster = -1
    if (len(claim) > 0) and is_valid:
        claims.append("".join(claim) + ".")
    #print("Num claims:", len(claims))
    return claims
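
For intuition, here is a minimal, self-contained sketch (hypothetical data, not part of this commit) of the substitution rule `get_claims` applies: tokens of a coreference mention are replaced, once, by the cluster's label, and possessive pronouns are skipped.

```python
# Hypothetical token stream for "They capture droplets", where "They"
# belongs to a cluster labelled "face coverings"
cluster_labels = {0: "face coverings"}
tokens = {
    0:  {'text': 'They', 'text_with_ws': 'They ', 'pos': 'PRON', 'tag': 'PRP', 'cluster': 0},
    5:  {'text': 'capture', 'text_with_ws': 'capture ', 'pos': 'VERB', 'tag': 'VBP'},
    13: {'text': 'droplets', 'text_with_ws': 'droplets', 'pos': 'NOUN', 'tag': 'NNS'},
}

claim = []
current_cluster = -1
for k in sorted(tokens):
    t = tokens[k]
    if 'cluster' in t:
        if current_cluster == t['cluster']:
            continue  # later token of the same mention
        if t['pos'] == 'PRON' and t['tag'] == 'PRP$':
            continue  # drop possessive pronouns
        current_cluster = t['cluster']
        claim.append(cluster_labels[current_cluster] + " ")
    else:
        claim.append(t['text_with_ws'])
        current_cluster = -1

print("".join(claim) + ".")  # -> "face coverings capture droplets."
```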
89 changes: 89 additions & 0 deletions claimer/parser.py
@@ -0,0 +1,89 @@
#from fastcoref import FCoref
from fastcoref import LingMessCoref
from fastcoref import spacy_component
import spacy
import itertools

#model = FCoref(device='cuda:0')
#model = FCoref()

#model = LingMessCoref(device='cuda:0')
model = LingMessCoref()

# initialize language model
nlp = spacy.load("en_core_web_md")
# add coref pipeline
#nlp.add_pipe("fastcoref")
# add pipeline (declared through entry_points in setup.py)
nlp.add_pipe("entityLinker", last=True)

def get_corefs(input_text):
    text = input_text.replace("\t", "").replace("\n", "")
    preds = model.predict(texts=[text])

    if len(preds) == 0:
        # no coreference output: return empty clusters so the caller's
        # tuple unpacking still works
        return ([], [])

    cluster_num = preds[0].get_clusters(as_strings=False)
    # print("Num Clusters:", len(cluster_num))
    # print("Initial Clusters:", cluster_num)

    # clean clusters: when spans overlap, keep the largest span per position
    cluster_spans = {}
    for cluster in cluster_num:
        for (i, j) in cluster:
            size = j - i
            for x in range(i, j):
                if x in cluster_spans:
                    if size > (cluster_spans[x][1] - cluster_spans[x][0]):
                        cluster_spans[x] = (i, j)
                else:
                    cluster_spans[x] = (i, j)

    # keep only the mentions that survived the overlap filter
    clusters = []
    for cluster in cluster_num:
        cluster_tokens = []
        for (i, j) in cluster:
            if (i in cluster_spans) and (cluster_spans[i] == (i, j)):
                cluster_tokens.append((i, j))
        if len(cluster_tokens) > 0:
            clusters.append(cluster_tokens)
    #print("Cluster Spans:", clusters)

    labels = preds[0].get_clusters()
    print("Cluster Tokens:", labels)

    return (clusters, labels)

def get_tokens(text):
    doc = nlp(text)

    # linked entities indexed by the token position where their span starts
    entities = {}
    for e in doc._.linkedEntities:
        span = e.get_span()
        entity = {
            'id': e.get_id(),
            'label': e.get_label(),
            'level': len(e.get_sub_entities())
        }
        entities[span.start] = entity
    #print("Entities:", entities)

    # tokens indexed by character offset, annotated with entity info
    tokens = {}
    for id, t in enumerate(doc):
        token = {
            'text': t.text,
            'text_with_ws': t.text_with_ws,
            'pos': t.pos_,
            'tag': t.tag_,
            'level': 99999
        }
        if id in entities:
            entity = entities[id]
            token['level'] = entity['level']
            token['label'] = entity['label']
        tokens[t.idx] = token
    #print("Tokens:", tokens)

    return tokens
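
For reference, a sketch of the shapes these two functions hand back to `paragraph.get_claims` (the example values below are illustrative, and assume the spaCy model and knowledge base have already been downloaded):

```python
from claimer import parser

# get_corefs: per coreference cluster, (start, end) character spans plus the
# same mentions as strings
clusters, labels = parser.get_corefs("Face masks work. They capture droplets.")
# clusters -> [[(0, 10), (17, 21)]]       (illustrative)
# labels   -> [['Face masks', 'They']]    (illustrative)

# get_tokens: a dict keyed by each token's character offset; tokens covered by
# a linked entity also carry 'label' and 'level'
tokens = parser.get_tokens("Face masks work.")
# tokens[0] -> {'text': 'Face', 'text_with_ws': 'Face ', 'pos': 'NOUN',
#               'tag': 'NN', 'level': 0, 'label': 'face mask'}  (illustrative)
```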
18 changes: 18 additions & 0 deletions makefile
@@ -0,0 +1,18 @@
init:
	pip install -r requirements.txt
	python -m spacy download en_core_web_md
	python -m spacy_entity_linker "download_knowledge_base"
	pip install --upgrade build
	pip install --upgrade twine

test:
	pytest -v -s tests

build:
	rm -rf dist/
	python -m build
	python -m twine upload --repository pypi dist/*

requirements:
	pipreqs --savepath=requirements.in && pip-compile
	rm requirements.in
35 changes: 35 additions & 0 deletions pyproject.toml
@@ -0,0 +1,35 @@
[build-system]
requires = [
    "setuptools==65.3.0",
    "fastcoref",
    "spacy",
    "spacy-entity-linker"
]
build-backend = "setuptools.build_meta"

[project]
name = "claimer"
version = "0.0.1"
authors = [
    { name="Carlos Badenes-Olmedo", email="carlos.badenes@upm.es" },
]
description = "Breaks down a textual paragraph into verifiable claims."
readme = "README.md"
requires-python = ">=3.7"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
    "Natural Language :: English",
    "Topic :: Software Development :: Libraries",
]
dependencies = [
    "fastcoref",
    "spacy",
    "spacy-entity-linker"
]

[project.urls]
"Homepage" = "https://github.com/librairy/claimer"
"Bug Tracker" = "https://github.com/librairy/claimer/issues"
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
fastcoref==2.1.6
spacy==3.5.3
spacy-entity-linker==1.0.3
33 changes: 33 additions & 0 deletions scripts/test_news.py
@@ -0,0 +1,33 @@
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from claimer import paragraph

if __name__ == "__main__":
    import json

    input_dir = "../../data/fake_news_corpus_spanish/news"
    output_dir = "../../data/fake_news_corpus_spanish/claims"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    file = open(f"{input_dir}/full_date.json")
    data = json.load(file)

    counter = 0
    for news in data:
        print(f"[{counter}] processing news", news['id'])
        counter += 1
        facts = paragraph.get_claims(news['claim'])
        news['facts'] = facts
        json_path = f"{output_dir}/{news['id']}.json"
        with open(json_path, "w") as outfile:
            json_out = json.dumps(news, ensure_ascii=False, indent=4)
            outfile.write(json_out)
        print(json_out)
        break  # stop after the first news item

    file.close()
Empty file added tests/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions tests/test_sample.py
@@ -0,0 +1,17 @@
from claimer import paragraph
import pytest

@pytest.fixture
def example_data():
    return {
        "id": "FK3",
        "text": "Face masks don’t work. Science in this area has evolved during the outbreak, the body of scientific evidence that has built up shows that the risk of transmission is made lower by wearing a face covering. The more we learn about COVID-19 the clearer it is that face coverings are an absolute vital tool in our fight against the virus. They effectively capture droplets, which is the main way the virus travels from person to person. According to the British Medical Association, if you don't wear it and have COVID-19, the risk of spreading it to others can be as high as 70%. If you do wear it, the risk drops to 5%. Make sure you wear it in all public indoor spaces and whenever you can't keep a 2m distance from others. Use a face covering is simple and easy way we can all stop the spread of the virus.",
        "source": "www.rebeccaharris.org"
    }

def test_text1(example_data):
    claims = paragraph.get_claims(example_data['text'])
    print("Sample:", example_data)
    for i, claim in enumerate(claims):
        print(f"{i}) {claim}")
    assert 8 == len(claims)
