Skip to content

Commit

Permalink
Add bold to the focused word in context for readability (#47)
Browse files Browse the repository at this point in the history
* Fixed issue #44 : Added bold.

Warning: this change breaks the former data model

* Removed flask migrate remnants

* Actually fixes issue #45 as well : moving to a proper click system

* Added tests for scripts

* Starting to implement client for importing data from CLI

* Working test builder

* Added export capacities

* Extracted corpus input conversion to avoid code duplication

* Added tests for the whole set of commands including corpus-list and corpus-dump

* Moved corpus-import to corpus-from-file
Created corpus-from-dir to easily reimport a previous dump
(corpus-from-dir itself is not tested yet)

* Finalizing the wonderful runtime-generated tests for corpus-from-dir

* Nice link

* Quick link to last edit page
  • Loading branch information
PonteIneptique authored and Jean-Baptiste-Camps committed Apr 6, 2018
1 parent 566771d commit 238564b
Show file tree
Hide file tree
Showing 29 changed files with 1,626 additions and 506 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Create a virtual environment, source it and run

```bash
pip install -r requirements.txt
python manage.py create_db
python manage.py db-create
```

## Run
Expand Down
6 changes: 5 additions & 1 deletion app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ def create_app(config_name="dev"):
static_folder=config[config_name].static_folder,
static_url_path="/statics"
)
app.config.from_object(config[config_name])
if not isinstance(config_name, str):
app.config.from_object(config)
else:
app.config.from_object(config[config_name])

app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

config[config_name].init_app(app)
Expand Down
234 changes: 234 additions & 0 deletions app/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
import click
import os

from . import create_app, db
from .models import (
Corpus,
AllowedPOS,
AllowedLemma,
AllowedMorph,
WordToken
)
from .main.views.utils import create_input_format_convertion


# Flask application shared by the CLI commands; it stays None until the
# click group callback in make_cli() assigns it.
app = None


# File names used by corpus-from-dir (read) and corpus-dump (write), keyed by
# the kind of data stored in each file. Order matters: the values are joined,
# in this order, into the corpus-from-dir help text.
DEFAULT_FILENAMES = dict(
    tokens="tokens.csv",
    POS="allowed_pos.txt",
    lemma="allowed_lemma.txt",
    morph="allowed_morph.csv",
)

def make_cli():
    """ Creates a Command Line Interface for everyday tasks

    :return: Click group
    """
    # NOTE: click uses a command function's docstring as its --help text when
    # no explicit ``help=`` is given, so the command docstrings below are
    # user-facing strings.

    @click.group()
    @click.option('--config', default="dev")
    def cli(config):
        """ Generates the client"""
        click.echo("Loading the application")
        # The sub-commands read the module-level ``app``; the group callback
        # runs before any of them and is the only place that sets it.
        global app
        app = create_app(config)

    @click.command("db-create")
    def db_create():
        """ Creates a local database
        """
        with app.app_context():
            db.create_all()
            db.session.commit()
            click.echo("Created the database")

    @click.command("db-recreate")
    def db_recreate():
        """ Recreates a local database. You probably should not use this on
        production.
        """
        with app.app_context():
            db.drop_all()
            db.create_all()
            db.session.commit()
            click.echo("Dropped then recreated the database")

    @click.command("db-fixtures")
    def db_fixtures():
        """ Loads demo/tests data to the database
        """
        with app.app_context():
            # Imported lazily so the test package is only needed by this command
            from tests.db_fixtures import add_corpus
            for corpus_name in ("wauchier", "floovant"):
                add_corpus(
                    corpus_name, db, with_token=True, tokens_up_to=None,
                    with_allowed_lemma=True, partial_allowed_lemma=False,
                    with_allowed_pos=True, partial_allowed_pos=False,
                    with_allowed_morph=True)
            click.echo("Loaded fixtures to the database")

    @click.command("run")
    def run():
        """ Run the application in Debug Mode [Not Recommended on production]
        """
        app.run()

    # The two help literals were concatenated without a separating space
    # ("file.First parameter"); the space is now part of the first literal.
    @click.command("corpus-from-file", help="Creates a corpus based on file. "
                                            "First parameter is the name")
    @click.argument("name")
    @click.option("--corpus", "tokens", type=click.File(), required=True,
                  help="Path of the file containing the pre-annotated corpus tokens")
    @click.option("--lemma", "lemma_file", type=click.File(), help="Path of the file containing the Allowed Lemma")
    @click.option("--POS", "POS_file", type=click.File(), help="Path of the file containing the Allowed POS")
    @click.option("--morph", "morph_file", type=click.File(),
                  help="Path of the file containing the Allowed Morphological tags")
    @click.option("--left", type=click.INT, help="Number of words to keep on the left of each token")
    @click.option("--right", type=click.INT, help="Number of words to keep on the right of each token")
    def corpus_ingest(
            name, tokens,
            lemma_file=None, POS_file=None, morph_file=None,
            left=None, right=None):
        # Lemma and POS lists are handed to the converter as plain text, while
        # the token and morph files are handed over as open file objects.
        if lemma_file is not None:
            lemma_file = lemma_file.read()

        if POS_file is not None:
            POS_file = POS_file.read()

        token_reader, lemma, morph, POS = create_input_format_convertion(
            tokens, lemma_file, morph_file, POS_file
        )

        with app.app_context():
            corpus = Corpus.create(
                name,
                word_tokens_dict=token_reader,
                allowed_lemma=lemma,
                allowed_POS=POS,
                allowed_morph=morph,
                context_left=left,
                context_right=right
            )
            click.echo(
                "Corpus created under the name {} with {} tokens".format(
                    name, corpus.tokens_count
                )
            )

    @click.command("corpus-from-dir",
                   help="Create a corpus based on a folder. File with following names ({}) "
                        "should be in the folder. First parameter is the name".format(
                            ", ".join(DEFAULT_FILENAMES.values())))
    @click.argument("name")
    @click.option("--path", type=click.Path(), required=True,
                  help="Path of the directory containing the corpus files")
    @click.option("--left", type=click.INT, help="Number of words to keep on the left of each token")
    @click.option("--right", type=click.INT, help="Number of words to keep on the right of each token")
    def corpus_import(name, path, left=None, right=None):
        # Expected location of each file inside the directory
        token_path = os.path.join(path, DEFAULT_FILENAMES["tokens"])
        morph_path = os.path.join(path, DEFAULT_FILENAMES["morph"])
        lemma_path = os.path.join(path, DEFAULT_FILENAMES["lemma"])
        pos_path = os.path.join(path, DEFAULT_FILENAMES["POS"])

        # Only the token file is mandatory; the allowed-value files are optional
        if not os.path.isfile(token_path):
            click.echo("Corpus not found")
            return

        lemma, POS, morph = None, None, None

        # The token and morph files are consumed through the readers built by
        # the converter, so they have to stay open until the corpus is created.
        # try/finally guarantees they are closed even if the import fails.
        tokens = open(token_path)
        try:
            if os.path.isfile(morph_path):
                morph = open(morph_path)
                click.echo("-- Found Morphological Allowed Values")

            if os.path.isfile(lemma_path):
                with open(lemma_path) as file:
                    lemma = file.read()
                click.echo("-- Found Lemma Allowed Values")

            if os.path.isfile(pos_path):
                with open(pos_path) as file:
                    POS = file.read()
                click.echo("-- Found POS Allowed Values")

            input_tokens, allowed_lemma, allowed_morph, allowed_POS = create_input_format_convertion(
                tokens, lemma, morph, POS
            )
            with app.app_context():
                data = Corpus.create(
                    name=name, word_tokens_dict=input_tokens,
                    allowed_lemma=allowed_lemma, allowed_morph=allowed_morph,
                    allowed_POS=allowed_POS, context_left=left,
                    context_right=right
                )
                click.echo("Corpus '{}' (ID : {}) created ".format(
                    name,
                    data.id
                ))
        finally:
            tokens.close()
            if morph is not None:
                morph.close()

    @click.command("corpus-list", help="Shows a list of corpus and their ID")
    def corpus_list():
        scheme = "{}\t| {}"  # Could use a 0 filling to allow for a nicer output
        with app.app_context():
            click.echo(scheme.format("ID", "Name"))
            for corpus in Corpus.query.all():
                click.echo(scheme.format(corpus.id, corpus.name))

    @click.command("corpus-dump", help="Dump corpus identified by {corpus} id. Use corpus-list to have a list of IDs")
    @click.argument("corpus", type=click.INT)
    @click.option("--path", type=click.Path(), required=True, help="Path where the corpus should be saved")
    def corpus_dump(corpus, path):
        # (file key, model, confirmation message), dumped in this order
        dumps = (
            ("tokens", WordToken, "--- Tokens dumped"),
            ("lemma", AllowedLemma, "--- Allowed Lemma Values dumped"),
            ("morph", AllowedMorph, "--- Allowed Morphological Values dumped"),
            ("POS", AllowedPOS, "--- Allowed POS Values dumped"),
        )
        with app.app_context():
            corpus = Corpus.query.get(corpus)

            # Check that the corpus exists before touching the file system, so
            # an unknown ID does not leave an empty directory behind
            if not corpus:
                click.echo("Corpus not found")
                return

            if not os.path.exists(path):
                os.makedirs(path)

            for key, model, message in dumps:
                with open(os.path.join(path, DEFAULT_FILENAMES[key]), "w") as file:
                    file.write(model.to_input_format(
                        model.query.filter(model.corpus == corpus.id)
                    ))
                click.echo(message)

    for command in (
            db_create, db_fixtures, db_recreate, run,
            corpus_ingest, corpus_import, corpus_dump, corpus_list):
        cli.add_command(command)

    return cli
22 changes: 8 additions & 14 deletions app/main/views/corpus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from flask import request, jsonify, flash, redirect, url_for

from .utils import render_template_with_nav_info, format_api_like_reply
from .utils import render_template_with_nav_info, format_api_like_reply, create_input_format_convertion
from .. import main
from ...utils.tsv import StringDictReader
from werkzeug.exceptions import BadRequest
Expand All @@ -15,22 +15,16 @@ def corpus_new():
""" Register a new corpus
"""
if request.method == "POST":

allowed_lemma = request.form.get("allowed_lemma")
if allowed_lemma is not None:
allowed_lemma = [x.replace('\r', '') for x in allowed_lemma.split("\n") if len(x.replace('\r', '').strip()) > 0]

allowed_POS = request.form.get("allowed_POS")
if allowed_POS is not None:
allowed_POS = [x.replace('\r', '') for x in allowed_POS.split(",") if len(x.replace('\r', '').strip()) > 0]

allowed_morph = request.form.get("allowed_morph")
if allowed_morph is not None:
allowed_morph = list(StringDictReader(allowed_morph))
tokens, allowed_lemma, allowed_morph, allowed_POS = create_input_format_convertion(
request.form.get("tsv"),
request.form.get("allowed_lemma", None),
request.form.get("allowed_morph", None),
request.form.get("allowed_POS", None)
)

corpus = Corpus.create(
request.form.get("name"),
word_tokens_dict=StringDictReader(request.form.get("tsv")),
word_tokens_dict=tokens,
allowed_lemma=allowed_lemma,
allowed_POS=allowed_POS,
allowed_morph=allowed_morph,
Expand Down
36 changes: 36 additions & 0 deletions app/main/views/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,41 @@
from ...models import Corpus
from ...utils.tsv import StringDictReader
from flask import render_template, request
from csv import DictReader


def create_input_format_convertion(tokens, allowed_lemma, allowed_morph, allowed_POS):
    """ Convert input data into Corpus.create formats

    Text inputs may be given either as strings or as readable file objects.

    :param tokens: Tokens for the corpus (tab separated values with a header)
    :type tokens: str or _io.TextIOWrapper
    :param allowed_lemma: Lemmas that will be allowed in the corpus, one per line
    :type allowed_lemma: str or _io.TextIOWrapper or None
    :param allowed_morph: Morphs that will be allowed in the corpus (tab
        separated values with a header)
    :type allowed_morph: str or _io.TextIOWrapper or None
    :param allowed_POS: POS that will be allowed in the corpus, comma separated
    :type allowed_POS: str or _io.TextIOWrapper or None
    :return: Tokens, Allowed Lemma, Allowed Morph, Allowed POS
    :rtype: (csv.DictReader, list(str), list(dict), list(str))
    """
    if allowed_lemma is not None:
        # The documented contract accepts file objects, but the parsing below
        # needs text: read file-like inputs first.
        if hasattr(allowed_lemma, "read"):
            allowed_lemma = allowed_lemma.read()
        # One lemma per line; carriage returns are dropped and blank lines skipped
        allowed_lemma = [
            line.replace('\r', '')
            for line in allowed_lemma.split("\n")
            if line.replace('\r', '').strip()
        ]

    if allowed_POS is not None:
        if hasattr(allowed_POS, "read"):
            allowed_POS = allowed_POS.read()
        # Comma separated values. Surrounding whitespace is stripped so that the
        # trailing newline of a file does not end up inside the last POS value.
        cleaned = (value.replace('\r', '').strip() for value in allowed_POS.split(","))
        allowed_POS = [value for value in cleaned if value]

    if allowed_morph is not None:
        if isinstance(allowed_morph, str):
            allowed_morph = list(StringDictReader(allowed_morph))
        else:
            allowed_morph = list(DictReader(allowed_morph, dialect="excel-tab"))

    # Tokens stay a lazy reader: a file given here must remain open until the
    # caller has consumed the rows.
    if isinstance(tokens, str):
        tokens = StringDictReader(tokens)
    else:
        tokens = DictReader(tokens, dialect="excel-tab")

    return tokens, allowed_lemma, allowed_morph, allowed_POS


def render_template_with_nav_info(template, **kwargs):
Expand Down
Loading

0 comments on commit 238564b

Please sign in to comment.