
Commit

first commit
Elvis MBONING committed Jul 24, 2020
1 parent 6f48990 commit 3202fb5
Showing 916 changed files with 44,506 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -127,3 +127,7 @@ dmypy.json

# Pyre type checker
.pyre/

# project
data/*.csv
data/*.txt
44 changes: 44 additions & 0 deletions Pipfile
@@ -0,0 +1,44 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
streamlit = "*"
pandas = "*"
nltk = "*"
matplotlib = "*"
gensim = "*"
wordcloud = "*"
sklearn = "*"
bokeh = "*"
keras = "*"
rasa = "*"
rasa-nlu = "*"
rasa-sdk = "*"
seaborn = "*"
pyLDAvis = "*"
IPython = "*"
sklearn-crfsuite = "*"
snips-nlu = "*"
snips-nlu-en = "*"
snips-nlu-metrics = "*"
snips-nlu-parsers = "*"
snips-nlu-utils = "*"
tensor2tensor = "*"
tensorboard = "*"
tensorflow = "*"
tensorflow-addons = "*"
tensorflow-datasets = "*"
tensorflow-estimator = "*"
tensorflow-gan = "*"
tensorflow-gpu = "*"
tensorflow-hub = "*"
tensorflow-metadata = "*"
tensorflow-probability = "*"
tensorflow-text = "*"

[requires]
python_version = "3.6"
83 changes: 83 additions & 0 deletions README.md
@@ -1,2 +1,85 @@
# NLU-Co_SemEval-Task5-2020
Experimentation code used for SemEval-2020 Task 5: an NLU/SVM-based model applied to characterise and extract counterfactual statements from raw data.

# Summary

We tackle the classification of counterfactual statements and the extraction of their antecedents/consequents in raw data by mobilizing, on the one hand, Support Vector Machines (SVMs) and, on the other, Natural Language Understanding (NLU) infrastructures available on the market for conversational agents.

# How to run these experiments

## Subtask1: counterfactual classification

### Dev environment

Please use pipenv to install the dependencies:

```
pipenv --python=3.6
pipenv shell
pipenv install
```

### SVM methods: sklearn experiments

- Train the model with this script:

```
python3 scripts/task1-train_damien.py
```
- Evaluate the model with this script:

```
python3 scripts/task1-label_damien.py
```
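
For orientation, these scripts follow the classic sklearn recipe of TF-IDF features feeding a linear SVM. A minimal sketch under assumed file and column names (the actual ones live in the scripts above):

```python
# Minimal sketch of a TF-IDF + linear-SVM counterfactual classifier.
# File and column names are assumptions; see the scripts for the real ones.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

df = pd.read_csv("data/subtask1_train.csv")  # assumed path
X_train, X_test, y_train, y_test = train_test_split(
    df["sentence"], df["gold_label"], test_size=0.2, random_state=10
)

model = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),  # word uni/bigram features
    ("svm", LinearSVC(C=1.0)),                       # linear-kernel SVM
])
model.fit(X_train, y_train)
print(classification_report(y_test, model.predict(X_test)))
```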

### NLU methods: Rasa and Snips experiments

- Train the Rasa, Snips, sklearn and fastText models with this script (uncomment the relevant line at the end):

```
python3 scripts/task1-train_elvis.py
```
- Evaluate the Rasa, Snips, sklearn and fastText models with this script:

```
python3 scripts/task1-label_elvis.py
```
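
The NLU engines cast subtask 1 as intent classification (counterfactual vs. not). As an illustration only, not the repository's actual code, a Snips NLU engine can be trained and queried like this, assuming the training data has been converted to Snips's JSON dataset format at a hypothetical path:

```python
# Illustrative sketch: subtask 1 as Snips NLU intent classification.
# The dataset path is an assumption.
import json

from snips_nlu import SnipsNLUEngine
from snips_nlu.default_configs import CONFIG_EN

with open("data/snips_subtask1_dataset.json") as f:  # assumed path
    dataset = json.load(f)

engine = SnipsNLUEngine(config=CONFIG_EN)
engine.fit(dataset)  # each intent holds counterfactual or non-counterfactual utterances

result = engine.parse("If I had left earlier, I would have caught the train.")
print(result["intent"]["intentName"], result["intent"]["probability"])
```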

## Subtask2: antecedent and consequent extraction

- Train the Rasa and Snips models with this script (uncomment the relevant line at the end):

```
python3 scripts/task2-train_elvis.py
```
- Evaluate the Rasa and Snips models with this script:

```
python3 scripts/task2-label_elvis.py
```
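
Here the antecedent and consequent are treated as entities/slots, so a trained engine returns character spans. A hedged sketch of reading those spans from a Snips parse result (the entity names and model path are assumptions):

```python
# Hedged sketch: recover antecedent/consequent character offsets from a
# trained Snips NLU engine. Entity names and model path are assumptions.
from snips_nlu import SnipsNLUEngine

engine = SnipsNLUEngine.from_path("models/snips_subtask2")  # assumed path
sentence = "If I had left earlier, I would have caught the train."

for slot in engine.parse(sentence)["slots"]:
    span = slot["range"]  # {"start": ..., "end": ...} character offsets
    print(slot["entity"], span["start"], span["end"], "->", slot["rawValue"])
```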

# Reference publication for the competition

```
@inproceedings{yang-2020-semeval-task5,
    title = "{S}em{E}val-2020 Task 5: Counterfactual Recognition",
    author = "Yang, Xiaoyu and Obadinma, Stephen and Zhao, Huasha and Zhang, Qiong and Matwin, Stan and Zhu, Xiaodan",
    booktitle = "Proceedings of the 14th International Workshop on Semantic Evaluation (SemEval-2020)",
    year = "2020",
    address = "Barcelona, Spain",
}
```
46 changes: 46 additions & 0 deletions configs/config_1/config_rasa_converrt.yml
@@ -0,0 +1,46 @@
language: "en"

pipeline:
- name: ConveRTTokenizer
- name: ConveRTFeaturizer
- name: RegexFeaturizer
- name: CRFEntityExtractor
  features: [
    ["low", "title"],
    ["bias", "suffix3"],
    ["upper", "pattern"]
    #["dep", "ents", "cats"]
  ]
  BILOU_flag: true
  # These values are passed to the sklearn_crfsuite.CRF tagger before training.
  max_iterations: 300
  L1_c: 0.0001
  L2_c: 0.2
- name: DIETClassifier
  epochs: 300
  batch_strategy: "balanced"
  learning_rate: 0.0001
  use_masked_language_model: true
  tensorboard_log_level: "epoch"
  tensorboard_log_directory: "log"
  drop_rate: 0.01
  batch_size: [64, 128]
  negative_margin_scale: 0.6
  maximum_positive_similarity: 0.6
  maximum_negative_similarity: -0.2
  similarity_type: "inner"
  embedding_dimension: 50
  number_of_negative_examples: 20
  share_hidden_layers: false
  transformer_size: 128
  number_of_transformer_layers: 3
  number_of_attention_heads: 128
  weight_sparsity: 0.8
  entity_recognition: true
  intent_classification: true
  evaluate_every_number_of_epochs: 40
  evaluate_on_number_of_examples: 500
  random_seed: 10
  hidden_layers_sizes:
    text: [256]
    label: [256]
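
For context, a config like this is typically loaded and trained through Rasa 1.x's Python API; a minimal sketch, assuming a hypothetical NLU training-data path (the repository's actual entry point is `scripts/task1-train_elvis.py`):

```python
# Minimal sketch: train a Rasa NLU model from this config (Rasa 1.x API).
# The training-data path and model name below are assumptions.
from rasa.nlu import config
from rasa.nlu.model import Trainer
from rasa.nlu.training_data import load_data

training_data = load_data("data/nlu_subtask1.md")  # assumed path
trainer = Trainer(config.load("configs/config_1/config_rasa_converrt.yml"))
trainer.train(training_data)
model_dir = trainer.persist("models/", fixed_model_name="converrt_diet")
print("Model saved to", model_dir)
```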
35 changes: 35 additions & 0 deletions configs/config_1/config_rasa_mitie.yml
@@ -0,0 +1,35 @@
language: "en"

pipeline:
- name: "MitieNLP"
  # language model to load
  model: "data/total_word_feature_extractor.dat"
- name: MitieTokenizer
  # Flag to check whether to split intents
  "intent_tokenization_flag": true
  # Symbol on which intent should be split
  "intent_split_symbol": "_"
- name: MitieFeaturizer
- name: CountVectorsFeaturizer
  "analyzer": 'word'  # use 'char' or 'char_wb' for character
  "token_pattern": r'(?u)\b\w\w+\b'
  "strip_accents": 'ascii'  # {'ascii', 'unicode', None}
  "stop_words": 'english'  # string {'english'}, list, or None (default)
  "min_df": 0.8  # float in range [0.0, 1.0] or int
  "max_df": 1.0  # float in range [0.0, 1.0] or int
  "min_ngram": 1  # int
  "max_ngram": 2  # int
  "max_features": null  # int or None
  "lowercase": true  # bool
- name: RegexFeaturizer
- name: LexicalSyntacticFeaturizer
  features: [
    ["low", "title", "upper"],
    ["BOS", "EOS", "low", "upper", "title", "digit"],
    ["low", "title", "upper"],
  ]
- name: MitieEntityExtractor
- name: MitieIntentClassifier



37 changes: 37 additions & 0 deletions configs/config_1/config_rasa_sklearn.yml
@@ -0,0 +1,37 @@
language: "en"

pipeline:
- name: "MitieNLP"
  # language model to load
  model: "data/total_word_feature_extractor.dat"
  cache_dir: null
- name: MitieTokenizer
- name: MitieFeaturizer
- name: RegexFeaturizer
#- name: "CRFEntityExtractor"
#  features: [
#    ["low", "title"],
#    ["bias", "suffix3"],
#    ["upper", "pos2"]
#    #["dep", "ents", "cats"]
#  ]
#  BILOU_flag: true
#  # These values are passed to the sklearn_crfsuite.CRF tagger before training.
#  max_iterations: 300
#  L1_c: 0.0001
#  L2_c: 0.02
- name: "SklearnIntentClassifier"
  num_threads: 8
  n_jobs: 8
  # This is used with the ``kernel`` hyperparameter in GridSearchCV.
  C: [1, 2, 5, 10, 30, 100]
  # Specifies the kernel to use with C-SVM.
  kernels: ["linear"]
  # Gamma parameter of the C-SVM.
  "gamma": [0.1]
  # We try to find a good number of cross-validation folds to use during
  # training; this caps the number of folds.
  "max_cross_validation_folds": 5
  # Scoring function used for evaluating the hyperparameters.
  "scoring_function": "f1_weighted"


71 changes: 71 additions & 0 deletions configs/config_1/config_rasa_spacy.yml
@@ -0,0 +1,71 @@
language: "en"

pipeline:
- name: ConveRTTokenizer
- name: ConveRTFeaturizer
#- name: MitieFeaturizer
- name: RegexFeaturizer
- name: DIETClassifier
  epochs: 500
  batch_strategy: "balanced"
  learning_rate: 0.0001
  use_masked_language_model: true
  tensorboard_log_level: "epoch"
  tensorboard_log_directory: "log"
  drop_rate: 0.01
  batch_size: [64, 128]
  negative_margin_scale: 0.6
  maximum_positive_similarity: 0.6
  maximum_negative_similarity: -0.2
  similarity_type: "inner"
  embedding_dimension: 50
  number_of_negative_examples: 20
  share_hidden_layers: false
  transformer_size: 128
  number_of_transformer_layers: 3
  number_of_attention_heads: 128
  weight_sparsity: 0.8
  entity_recognition: false
  intent_classification: true
  evaluate_every_number_of_epochs: 40
  evaluate_on_number_of_examples: 500
  random_seed: 10
  hidden_layers_sizes:
    text: [256]
    label: [256]

#- name: EmbeddingIntentClassifier
#  # nn architecture
#  "hidden_layers_sizes_a": [256, 128]
#  "hidden_layers_sizes_b": [256, 128]
#  "hidden_layers_sizes_c": [128, 64]
#  "hidden_layers_sizes_d": [128, 64]
#  "hidden_layers_sizes_e": [64, 32]
#  "hidden_layers_sizes_f": [64, 32]
#  "batch_size": [64, 256]
#  "epochs": 100
#  "learning_rate": 0.1
#  # embedding parameters
#  "embed_dim": 100
#  "mu_pos": 0.6  # should be 0.0 < ... < 1.0 for 'cosine'
#  "mu_neg": -0.2  # should be -1.0 < ... < 1.0 for 'cosine'
#  "similarity_type": "inner"  # string 'cosine' or 'inner'
#  "num_neg": 20
#  "use_max_sim_neg": true  # flag which loss function to use
#  "random_seed": 10  # set to any int to generate a reproducible training result
#  # regularization
#  "C2": 0.0001
#  "C_emb": 0.8
#  "droprate": 0.01
#  "ranking_length": 5
#  # flag for tokenizing intents
#  "intent_tokenization_flag": false
#  "intent_split_symbol": "_"
#  # visualization of accuracy
#  "evaluate_every_num_epochs": 40  # small values may hurt performance
#  "evaluate_on_num_examples": 1000  # large values may hurt performance
- name: "EntitySynonymMapper"

policies:
- name: EmbeddingPolicy
  max_history: 10
28 changes: 28 additions & 0 deletions configs/config_1/config_rasa_whitespace.yml
@@ -0,0 +1,28 @@
language: "en"

pipeline:
- name: "MitieNLP"
  # language model to load
  model: "data/total_word_feature_extractor.dat"
- name: WhitespaceTokenizer
- name: RegexFeaturizer
- name: LexicalSyntacticFeaturizer
- name: CountVectorsFeaturizer
  # n-grams at the edges of words are padded with space
  "analyzer": 'word'  # use 'char' or 'char_wb' for character
  "strip_accents": 'ascii'  # {'ascii', 'unicode', None}
  "stop_words": 'english'  # string {'english'}, list, or None (default)
  "min_df": 1  # float in range [0.0, 1.0] or int
  "max_df": 1.0  # float in range [0.0, 1.0] or int
  "min_ngram": 1  # int
  "max_ngram": 2  # int
  "max_features": null  # int or None
  "lowercase": true  # bool
- name: "KeywordIntentClassifier"
  case_sensitive: True
- name: EntitySynonymMapper
- name: ResponseSelector

policies:
- name: EmbeddingPolicy
  max_history: 10
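
Once any of these configs has been trained, the resulting model can be queried with Rasa 1.x's `Interpreter`; a minimal sketch (the model path is an assumption):

```python
# Minimal sketch: query a trained Rasa NLU model (Rasa 1.x API).
# The model directory below is an assumption.
from rasa.nlu.model import Interpreter

interpreter = Interpreter.load("models/nlu_whitespace")  # assumed path
result = interpreter.parse("If it had rained, the match would have been stopped.")
print(result["intent"])    # predicted intent with confidence
print(result["entities"])  # extracted entities, if any
```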
15 changes: 15 additions & 0 deletions configs/config_2/config_rasa_sklearn.yml
@@ -0,0 +1,15 @@
language: "en"

pipeline:
- name: SpacyNLP                 # loads the spaCy language model
- name: SpacyTokenizer           # splits the sentence into tokens
- name: SpacyEntityExtractor     # uses the pretrained spaCy NER model (entity extraction)
- name: RegexFeaturizer
- name: SpacyFeaturizer          # creates sentence vector representations
- name: SklearnIntentClassifier  # defines the classifier
  num_threads: 32
  n_jobs: 32
  verbose: 10
  random_seed: 10
  return_train_score: false
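
`SpacyNLP` expects a spaCy English model to be installed. A small hedged sketch for ensuring one is available before training (the model name `en_core_web_md` is an assumption; any English model Rasa can load would do):

```python
# Sketch: make sure a spaCy English model is installed before running
# this pipeline. The model name is an assumption.
import spacy

try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    from spacy.cli import download
    download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")
```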
