Commit 3202fb5 by Elvis MBONING, committed Jul 24, 2020 (1 parent: 6f48990).
Showing 916 changed files with 44,506 additions and 0 deletions.
@@ -127,3 +127,7 @@ dmypy.json

# Pyre type checker
.pyre/

# project
data/*.csv
data/*.txt
@@ -0,0 +1,44 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
streamlit = "*"
pandas = "*"
nltk = "*"
matplotlib = "*"
gensim = "*"
wordcloud = "*"
sklearn = "*"
bokeh = "*"
keras = "*"
rasa = "*"
rasa-nlu = "*"
rasa-sdk = "*"
seaborn = "*"
pyLDAvis = "*"
IPython = "*"
sklearn-crfsuite = "*"
snips-nlu = "*"
snips-nlu-en = "*"
snips-nlu-metrics = "*"
snips-nlu-parsers = "*"
snips-nlu-utils = "*"
tensor2tensor = "*"
tensorboard = "*"
tensorflow = "*"
tensorflow-addons = "*"
tensorflow-datasets = "*"
tensorflow-estimator = "*"
tensorflow-gan = "*"
tensorflow-gpu = "*"
tensorflow-hub = "*"
tensorflow-metadata = "*"
tensorflow-probability = "*"
tensorflow-text = "*"

[requires]
python_version = "3.6"
@@ -1,2 +1,85 @@
# NLU-Co_SemEval-Task5-2020

Experimentation code used for SemEval-2020 Task 5: an NLU/SVM-based model applied to characterise and extract counterfactual items from raw data.

# Summary

We tackle the classification of counterfactual statements and the extraction of antecedents/consequents from raw data by mobilizing, on the one hand, Support Vector Machines (SVMs) and, on the other hand, Natural Language Understanding (NLU) infrastructures available on the market for conversational agents.

# How to run these experiments

## Subtask 1: counterfactual classification

### Dev environment

Please use pipenv to install the dependencies:

```
pipenv --python=3.6
pipenv shell
pipenv install
```

### SVM methods: sklearn experiments

- Train the model with this script:

```
python3 scripts/task1-train_damien.py
```

- Evaluate the model with this script:

```
python3 scripts/task1-label_damien.py
```
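
The training and labelling scripts themselves are not part of this diff, so the snippet below is only a rough sketch of the kind of TF-IDF + linear-SVM pipeline such a script could build with scikit-learn. The file path and column names (`data/train.csv`, `sentence`, `gold_label`) are assumptions for illustration, not the repository's actual layout.

```python
# Illustrative sketch only: a TF-IDF + linear-SVM baseline for Subtask 1
# (counterfactual vs. non-counterfactual). The path and column names below
# are assumptions, not what scripts/task1-train_damien.py actually uses.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

df = pd.read_csv("data/train.csv")  # assumed training file with gold labels

X_train, X_test, y_train, y_test = train_test_split(
    df["sentence"], df["gold_label"], test_size=0.2, random_state=10
)

clf = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
    ("svm", LinearSVC(C=1.0)),
])
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
```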

### NLU methods: Rasa and Snips experiments

- Train the Rasa, Snips, sklearn and fastText models with this script (uncomment the relevant line at the end):

```
python3 scripts/task1-train_elvis.py
```

- Evaluate the Rasa, Snips, sklearn and fastText models with this script:

```
python3 scripts/task1-label_elvis.py
```
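
Both Rasa and Snips treat Subtask 1 as a two-intent classification problem. Purely as a hedged sketch of how the Snips engine is trained and queried (the real conversion from the SemEval data into a Snips dataset happens inside the scripts above; the dataset path and intent names here are assumptions):

```python
# Illustrative sketch only: Snips NLU as a binary intent classifier for
# Subtask 1. The dataset file and intent names are assumptions; the actual
# dataset construction lives in scripts/task1-train_elvis.py.
import io
import json

from snips_nlu import SnipsNLUEngine

# A Snips dataset is a JSON document with "intents", "entities" and "language",
# assumed here to have been generated from the SemEval training data beforehand.
with io.open("data/snips_dataset.json", encoding="utf-8") as f:
    dataset = json.load(f)

engine = SnipsNLUEngine()
engine.fit(dataset)

result = engine.parse("If I had studied harder, I would have passed the exam.")
print(result["intent"]["intentName"], result["intent"]["probability"])
```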

## Subtask 2: antecedent and consequent extraction

- Train the Rasa and Snips models with this script (uncomment the relevant line at the end):

```
python3 scripts/task2-train_elvis.py
```

- Evaluate the Rasa and Snips models with this script:

```
python3 scripts/task2-label_elvis.py
```
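
If, as the entity extractors in the configuration files added by this commit suggest, the antecedent and consequent are modelled as NLU entities, then evaluation mostly amounts to reading character spans back out of the parse result. A minimal sketch, assuming a Rasa 1.x installation, a hypothetical model directory, and hypothetical entity names `antecedent`/`consequent`:

```python
# Illustrative sketch only: recovering antecedent/consequent character offsets
# from a trained Rasa NLU model (Rasa 1.x API). The model path and the entity
# names are assumptions, not necessarily the repository's actual choices.
from rasa.nlu.model import Interpreter

interpreter = Interpreter.load("models/nlu/task2")  # hypothetical model directory

parsed = interpreter.parse(
    "If I had studied harder, I would have passed the exam."
)

# The task scores character offsets; (-1, -1) marks a span treated as absent.
spans = {"antecedent": (-1, -1), "consequent": (-1, -1)}
for entity in parsed.get("entities", []):
    if entity["entity"] in spans:
        spans[entity["entity"]] = (entity["start"], entity["end"])
print(spans)
```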

# Reference publication for the competition

```
@inproceedings{yang-2020-semeval-task5,
  title = "{S}em{E}val-2020 Task 5: Counterfactual Recognition",
  author = "Yang, Xiaoyu and Obadinma, Stephen and Zhao, Huasha and Zhang, Qiong and Matwin, Stan and Zhu, Xiaodan",
  booktitle = "Proceedings of the 14th International Workshop on Semantic Evaluation (SemEval-2020)",
  year = "2020",
  address = "Barcelona, Spain",
}
```
@@ -0,0 +1,46 @@
language: "en"

pipeline:
  - name: ConveRTTokenizer
  - name: ConveRTFeaturizer
  - name: RegexFeaturizer
  - name: CRFEntityExtractor
    features: [
      ["low", "title"],
      ["bias", "suffix3"],
      ["upper", "pattern"]
      #["dep", "ents", "cats"]
    ]
    BILOU_flag: true
    # This is the value given to the sklearn_crfsuite.CRF tagger before training.
    max_iterations: 300
    L1_c: 0.0001
    L2_c: 0.2
  - name: DIETClassifier
    epochs: 300
    batch_strategy: "balanced"
    learning_rate: 0.0001
    use_masked_language_model: true
    tensorboard_log_level: "epoch"
    tensorboard_log_directory: "log"
    drop_rate: 0.01
    batch_size: [64, 128]
    negative_margin_scale: 0.6
    maximum_positive_similarity: 0.6
    maximum_negative_similarity: -0.2
    similarity_type: "inner"
    embedding_dimension: 50
    number_of_negative_examples: 20
    share_hidden_layers: false
    transformer_size: 128
    number_of_transformer_layers: 3
    number_of_attention_heads: 128
    weight_sparsity: 0.8
    entity_recognition: true
    intent_classification: true
    evaluate_every_number_of_epochs: 40
    evaluate_on_number_of_examples: 500
    random_seed: 10
    hidden_layers_sizes:
      text: [256]
      label: [256]
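
A pipeline configuration like the one above is what the training scripts feed to Rasa, either through the `rasa` CLI or from Python. The snippet below is a minimal sketch of the Python route, assuming a Rasa 1.x installation and hypothetical config, data and output paths (none of these file names come from the repository):

```python
# Illustrative sketch only: training a Rasa NLU model with a pipeline config
# like the one above (Rasa 1.x API). All paths are assumptions.
from rasa.nlu import config
from rasa.nlu.model import Trainer
from rasa.nlu.training_data import load_data

training_data = load_data("data/nlu_train.md")           # Rasa-format training examples
trainer = Trainer(config.load("configs/config_diet_crf.yml"))
trainer.train(training_data)

# Persist the trained pipeline so it can later be reloaded with Interpreter.load().
model_directory = trainer.persist("models/nlu", fixed_model_name="diet_crf")
print("Model stored in", model_directory)
```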
@@ -0,0 +1,35 @@
language: "en"

pipeline:
  - name: "MitieNLP"
    # language model to load
    model: "data/total_word_feature_extractor.dat"
  - name: MitieTokenizer
    # Flag to check whether to split intents
    "intent_tokenization_flag": true
    # Symbol on which intent should be split
    "intent_split_symbol": "_"
  - name: MitieFeaturizer
  - name: CountVectorsFeaturizer
    "analyzer": 'word' # use 'char' or 'char_wb' for character
    "token_pattern": r'(?u)\b\w\w+\b'
    "strip_accents": 'ascii' # {'ascii', 'unicode', None}
    "stop_words": 'english' # string {'english'}, list, or None (default)
    "min_df": 0.8 # float in range [0.0, 1.0] or int
    "max_df": 1.0 # float in range [0.0, 1.0] or int
    "min_ngram": 1 # int
    "max_ngram": 2 # int
    "max_features": null # int or None
    "lowercase": true # bool
  - name: RegexFeaturizer
  - name: LexicalSyntacticFeaturizer
    features: [
      ["low", "title", "upper"],
      ["BOS", "EOS", "low", "upper", "title", "digit"],
      ["low", "title", "upper"],
    ]
  - name: MitieEntityExtractor
  - name: MitieIntentClassifier
@@ -0,0 +1,37 @@
language: "en"

pipeline:
  - name: "MitieNLP"
    # language model to load
    model: "data/total_word_feature_extractor.dat"
    cache_dir: null
  - name: MitieTokenizer
  - name: MitieFeaturizer
  - name: RegexFeaturizer
  #- name: "CRFEntityExtractor"
  #  features: [
  #    ["low", "title"],
  #    ["bias", "suffix3"],
  #    ["upper", "pos2"]
  #    #["dep", "ents", "cats"]
  #  ]
  #  BILOU_flag: true
  #  # This is the value given to the sklearn_crfsuite.CRF tagger before training.
  #  max_iterations: 300
  #  L1_c: 0.0001
  #  L2_c: 0.02
  - name: "SklearnIntentClassifier"
    num_threads: 8
    n_jobs: 8
    # Regularization values for the C-SVM; used with the ``kernel`` hyperparameter in GridSearchCV.
    C: [1, 2, 5, 10, 30, 100]
    # Specifies the kernel to use with C-SVM.
    kernels: ["linear"]
    # Gamma parameter of the C-SVM.
    "gamma": [0.1]
    # We try to find a good number of cross folds to use during intent training.
    "max_cross_validation_folds": 5
    # Scoring function used for evaluating the hyperparameters.
    "scoring_function": "f1_weighted"
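
Rasa's `SklearnIntentClassifier` wraps a scikit-learn C-SVM and cross-validates it over exactly this kind of grid. For readers who want the standalone equivalent outside Rasa, here is a rough sketch of the same search; the toy texts and TF-IDF features are assumptions standing in for the featurizer output that Rasa would normally supply.

```python
# Rough standalone equivalent of the grid search configured above: a C-SVM
# cross-validated over C / kernel / gamma with weighted-F1 scoring.
# Toy data and TF-IDF features are assumptions for illustration only.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

texts = [
    "If it had rained, we would have stayed home.",
    "Had I known earlier, I would have acted differently.",
    "We stayed home because it rained.",
    "I acted as soon as I found out.",
]
labels = ["counterfactual", "counterfactual", "other", "other"]

X = TfidfVectorizer().fit_transform(texts)

param_grid = {
    "C": [1, 2, 5, 10, 30, 100],  # mirrors the C list in the config
    "kernel": ["linear"],         # mirrors "kernels"
    "gamma": [0.1],               # mirrors "gamma"
}
search = GridSearchCV(SVC(), param_grid, cv=2, scoring="f1_weighted")
search.fit(X, labels)
print(search.best_params_)
```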
@@ -0,0 +1,71 @@
language: "en"

pipeline:
  - name: ConveRTTokenizer
  - name: ConveRTFeaturizer
  #- name: MitieFeaturizer
  - name: RegexFeaturizer
  - name: DIETClassifier
    epochs: 500
    batch_strategy: "balanced"
    learning_rate: 0.0001
    use_masked_language_model: true
    tensorboard_log_level: "epoch"
    tensorboard_log_directory: "log"
    drop_rate: 0.01
    batch_size: [64, 128]
    negative_margin_scale: 0.6
    maximum_positive_similarity: 0.6
    maximum_negative_similarity: -0.2
    similarity_type: "inner"
    embedding_dimension: 50
    number_of_negative_examples: 20
    share_hidden_layers: false
    transformer_size: 128
    number_of_transformer_layers: 3
    number_of_attention_heads: 128
    weight_sparsity: 0.8
    entity_recognition: false
    intent_classification: true
    evaluate_every_number_of_epochs: 40
    evaluate_on_number_of_examples: 500
    random_seed: 10
    hidden_layers_sizes:
      text: [256]
      label: [256]

  #- name: EmbeddingIntentClassifier
    # nn architecture
    #"hidden_layers_sizes_a": [256, 128]
    #"hidden_layers_sizes_b": [256, 128]
    #"hidden_layers_sizes_c": [128, 64]
    #"hidden_layers_sizes_d": [128, 64]
    #"hidden_layers_sizes_e": [64, 32]
    #"hidden_layers_sizes_f": [64, 32]
    #"batch_size": [64, 256]
    #"epochs": 100
    #"learning_rate": 0.1
    # embedding parameters
    #"embed_dim": 100
    #"mu_pos": 0.6 # should be 0.0 < ... < 1.0 for 'cosine'
    #"mu_neg": -0.2 # should be -1.0 < ... < 1.0 for 'cosine'
    #"similarity_type": "inner" # string 'cosine' or 'inner'
    #"num_neg": 20
    #"use_max_sim_neg": true # flag which loss function to use
    #"random_seed": 10 # set to any int to generate a reproducible training result
    # regularization
    #"C2": 0.0001
    #"C_emb": 0.8
    #"droprate": 0.01
    #"ranking_length": 5
    # flag for tokenizing intents
    #"intent_tokenization_flag": false
    #"intent_split_symbol": "_"
    # visualization of accuracy
    #"evaluate_every_num_epochs": 40 # small values may hurt performance
    #"evaluate_on_num_examples": 1000 # large values may hurt performance
  - name: "EntitySynonymMapper"

policies:
  - name: EmbeddingPolicy
    max_history: 10
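
This configuration also writes TensorBoard summaries to the `log/` directory during DIET training. Once a model has been trained with it, a held-out evaluation along the lines of what the labelling scripts report can be run through Rasa's test module; the sketch below assumes a Rasa 1.x installation, and the data and model paths are hypothetical.

```python
# Illustrative sketch only: evaluating a trained Rasa NLU model on held-out
# data (Rasa 1.x API). The data and model paths are assumptions.
from rasa.nlu.test import run_evaluation

results = run_evaluation(
    "data/nlu_test.md",          # held-out examples in Rasa format (assumed path)
    "models/nlu/diet_intent",    # directory produced by Trainer.persist() (assumed path)
)
print(sorted(results.keys()))    # intent/entity evaluation reports
```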
@@ -0,0 +1,28 @@
language: "en"

pipeline:
  - name: "MitieNLP"
    # language model to load
    model: "data/total_word_feature_extractor.dat"
  - name: WhitespaceTokenizer
  - name: RegexFeaturizer
  - name: LexicalSyntacticFeaturizer
  - name: CountVectorsFeaturizer
    # n-grams at the edges of words are padded with space
    "analyzer": 'word' # use 'char' or 'char_wb' for character
    "strip_accents": 'ascii' # {'ascii', 'unicode', None}
    "stop_words": 'english' # string {'english'}, list, or None (default)
    "min_df": 1 # float in range [0.0, 1.0] or int
    "max_df": 1.0 # float in range [0.0, 1.0] or int
    "min_ngram": 1 # int
    "max_ngram": 2 # int
    "max_features": null # int or None
    "lowercase": true # bool
  - name: "KeywordIntentClassifier"
    case_sensitive: True
  - name: EntitySynonymMapper
  - name: ResponseSelector

policies:
  - name: EmbeddingPolicy
    max_history: 10
@@ -0,0 +1,15 @@
language: "en"

pipeline:
  - name: SpacyNLP                 # loads the spaCy language model
  - name: SpacyTokenizer           # splits the sentence into tokens
  - name: SpacyEntityExtractor     # uses the pretrained spaCy NER model (entity extraction)
  - name: RegexFeaturizer
  - name: SpacyFeaturizer          # creates sentence vector representations
  - name: SklearnIntentClassifier  # defines the classifier
    num_threads: 32
    n_jobs: 32
    verbose: 10
    random_seed: 10
    return_train_score: false