diff --git a/.travis.yml b/.travis.yml index 0134267..f0732ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,6 @@ matrix: - python: "3.8" before_install: - export BOTO_CONFIG=/dev/null # See https://github.com/travis-ci/travis-ci/issues/7940 - - export BING_SEARCH_API_KEY=${BING_SEARCH_API_KEY:=secret} # Fallback to 'secret' to avoid empty value - export PYTHONHASHSEED=500 # Make word2vec deterministic script: - pylint -E acres && py.test --cov=acres diff --git a/acres/evaluation/__init__.py b/acres/evaluation/__init__.py index 778dce8..e2f16a9 100644 --- a/acres/evaluation/__init__.py +++ b/acres/evaluation/__init__.py @@ -1,6 +1,6 @@ """ Package containing evaluation modules. """ -from acres.evaluation import corpus, evaluation, metrics +from acres.evaluation import evaluation, metrics -__all__ = ['corpus', 'evaluation', 'metrics'] +__all__ = ['evaluation', 'metrics'] diff --git a/acres/evaluation/corpus.py b/acres/evaluation/corpus.py deleted file mode 100755 index eed81fc..0000000 --- a/acres/evaluation/corpus.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -Module to debug expansion candidates on a corpus (typically the training set). - -.. deprecated:: 0.1 - Corpus-based evaluation has not been used recently (e.g. not used in Michel's PhD Thesis). - -.. codeauthor:: Stefan Schulz -""" - -import logging -import math -import re -from typing import Dict, List, Tuple - -from acres import constants -from acres.ngram import finder -from acres.preprocess import resource_factory -from acres.rater import rater -from acres.util import acronym as acro_util -from acres.web import base - -logger = logging.getLogger(__name__) - -VERBOSE = False -DIV = 1 # for sampling, if no sampling DIV = 1. Sampling is used for testing - - -def find_synonyms() -> None: - """ - TODO: this routine was originally intended to process a large list of acronyms + contexts - Finds synonyms using a n-gram frequency list from related corpus. 
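# Illustrative sketch (not part of the diff): the removed corpus-based pipeline
# below splits every acronym-bearing n-gram into (left, acronym, right) triples
# before querying the n-gram model. A minimal stand-alone version of that
# splitting step; `is_acronym` is assumed to behave like
# acres.util.acronym.is_acronym (which is kept by this change set).
from typing import Callable, List, Tuple

def split_around_acronyms(ngram: str,
                          is_acronym: Callable[[str], bool]) -> List[Tuple[str, str, str]]:
    """Return one (left, acronym, right) triple per acronym token in `ngram`."""
    tokens = ngram.split(" ")
    triples = []
    for i, token in enumerate(tokens):
        if is_acronym(token):
            triples.append((" ".join(tokens[:i]), token, " ".join(tokens[i + 1:])))
    return triples

# Example: split_around_acronyms("akutes ARDS diagnostiziert", is_acronym)
#   -> [("akutes", "ARDS", "diagnostiziert")]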
- TODO: reformatting logfile and filter criteria - - :return: - """ - - # ngrams that contain at least one acronym - acronym_ngrams = resource_factory.get_acronym_ngrams() - - # dictionary from which the logfile is generated - d_log_corpus = {} # type: Dict[str, List[str]] - - # dictionary from which the logfile is generated - d_log_web = {} # type: Dict[str, List[str]] - - logger.debug("Dumps loaded") - - count = 0 - for ngram in acronym_ngrams: # language model, filtered by ngrams containing acronyms - count = count + 1 - if count % 1000 == 0: - # time.sleep(10) - logger.info(count) - # and ngram.count(" ") < 3: - if not ngram.isupper() and constants.LINE_BREAK not in ngram and count % DIV == 0: - # ngrams with newlines substitutes ("¶") seemed to be useless for - # this purpose - - logger.debug("-----------------------") - logger.debug(ngram) - splits = acro_util.split_ngram(ngram.strip()) - for split in splits: - left_string = split[0].strip() - acronym = split[1].strip() - right_string = split[2].strip() - # Parms: min_win_size, minfreq, maxcount, min_number_tokens, max_number_tokens - # Set Parameters - - if len(acronym) == 2: - constraints = finder.FinderConstraints(min_freq=2, max_count=50, - min_num_tokens=1, - max_num_tokens=4 + ngram.count(" ")) - else: - constraints = finder.FinderConstraints(min_freq=2, max_count=50, - min_num_tokens=2, - max_num_tokens=4 + ngram.count(" ")) - - # prepare parameters for Web model - if constants.DIGIT_MARKER in ngram: - li_web = [] # type: List[Tuple[int, str]] - else: - query = left_string + " " + acronym + " " + right_string - - # TODO use text.replace_punctuation instead - query = query.replace(".", " ").replace(",", " ") - - li_web = base.ngrams_web_dump("\"" + query + "\"", 1, 10) - - # Prepare parameters for corpus model - # TODO potentially broken - if left_string == "": - left_string = "*" - if right_string == "": - right_string = "*" - li_corpus = finder.find_embeddings(left_string, acronym, right_string, constraints) - - _process_corpus(li_corpus, acronym, ngram, d_log_corpus) - _process_corpus(li_web, acronym, ngram, d_log_web) - - # "Logs" are files with short form expansions - # logCorpus: expansions based on ngram model - # logWebs: expansions from Web mining - _write_log(d_log_corpus, resource_factory.get_log_corpus_filename()) - _write_log(d_log_web, resource_factory.get_log_web_filename()) - - -def _process_corpus(corpus: List[Tuple[int, str]], acronym: str, ngram: str, - log: Dict[str, List[str]]) -> None: - """ - @todo return log instead of receiving it via parameter - - :param corpus: - :param acronym: - :param ngram: - :param log: - :return: - """ - # morphemes = resource_factory.get_morphemes() - # TODO: as morphemes are not a public resource, maybe ignore them at least in our experiments - - for item in corpus: - old_exp = "" - (freq, exp) = item # Frequency, Ngram expression - - first_condition = re.search(r"^[\s\-\w]*$", exp) is not None - second_condition = acronym.lower() != exp.lower()[0:len(acronym.lower())] - if first_condition and second_condition: - if exp != old_exp: - # score_corpus = 0 - (_, score_corpus) = rater.get_acro_def_pair_score(acronym, exp) - if score_corpus > 0: - log_score = str(round(score_corpus * math.log10(freq), 2)) - score = str(round(score_corpus, 2)) - result = log_score + " " + exp + " " + score + " " + str(freq) + " \t" + ngram - if acronym not in log: - log[acronym] = [result] - else: - log[acronym].append(result) - old_exp = exp - - -def _write_log(log: Dict[str, List[str]], filename: 
str) -> None: - """ - Writes a log into a file described by the filename. - - :param log: - :param filename: - :return: - """ - file = open(filename, "w", encoding="UTF-8") - - for acronym in log: - for result in log[acronym]: - file.write(acronym.rjust(8) + "\t" + result + "\n") - - file.close() - - -if __name__ == "__main__": - find_synonyms() diff --git a/acres/evaluation/evaluation.py b/acres/evaluation/evaluation.py index 4b5f458..8bef8f2 100644 --- a/acres/evaluation/evaluation.py +++ b/acres/evaluation/evaluation.py @@ -9,7 +9,6 @@ from itertools import islice from typing import Dict, Tuple, List, Set -import acres.util.acronym from acres.evaluation import metrics from acres.model import expansion_standard, detection_standard, topic_list from acres.resolution import resolver @@ -52,7 +51,7 @@ def test_input(true_expansions: Set[str], possible_expansions: List[str], return False -def analyze(contextualized_acronym: acres.util.acronym.Acronym, true_expansions: Set[str], +def analyze(contextualized_acronym: Acronym, true_expansions: Set[str], strategy: resolver.Strategy, max_tries: int) -> Dict[str, bool]: """ Analyze a given row of the gold standard. @@ -75,7 +74,7 @@ def analyze(contextualized_acronym: acres.util.acronym.Acronym, true_expansions: # Remove context to improve cache hit # XXX We currently support context only for n-grams - if strategy not in [resolver.Strategy.NGRAM, resolver.Strategy.FASTNGRAM]: + if strategy != resolver.Strategy.FASTNGRAM: left_context = "" right_context = "" diff --git a/acres/fastngram/fastngram.py b/acres/fastngram/fastngram.py index 89bdc2f..6dcb3a3 100644 --- a/acres/fastngram/fastngram.py +++ b/acres/fastngram/fastngram.py @@ -7,11 +7,10 @@ from collections import OrderedDict from typing import Dict, Set, Tuple, Iterator, List, Union -import acres.util.acronym -from acres.model import topic_list from acres.preprocess import resource_factory from acres.util import functions from acres.util.functions import import_conf +from acres.util.acronym import Acronym logger = logging.getLogger(__name__) @@ -118,7 +117,7 @@ def fastngram(acronym: str, left_context: str = "", right_context: str = "", :param max_rank: :return: """ - contextualized_acronym = acres.util.acronym.Acronym(acronym=acronym, left_context=left_context, + contextualized_acronym = Acronym(acronym=acronym, left_context=left_context, right_context=right_context) contexts = _generate_acronym_contexts(contextualized_acronym) @@ -145,7 +144,7 @@ def fasttype(acronym: str, left_context: str = "", right_context: str = "", yield ngram -def _find_contexts(acronym: str, min_freq: int) -> 'List[topic_list.Acronym]': +def _find_contexts(acronym: str, min_freq: int) -> 'List[Acronym]': """ Find contexts in the training data where this acronym appears. @@ -155,7 +154,7 @@ def _find_contexts(acronym: str, min_freq: int) -> 'List[topic_list.Acronym]': """ model = resource_factory.get_center_map(functions.partition(acronym, PARTITIONS)) - all_contexts = [] # type: List[topic_list.Acronym] + all_contexts = [] # type: List[Acronym] for out_freq, contexts in model.contexts(acronym).items(): for left, right in contexts: # Do not allow empty contexts. 
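# Illustrative sketch (not part of the diff): `Acronym` is a namedtuple with the
# fields (acronym, left_context, right_context), defined in acres/util/acronym.py
# and now imported directly. The evaluation change above clears the context for
# every strategy except FASTNGRAM, the only strategy that still consumes it.
# A minimal usage example:
from acres.util.acronym import Acronym

contextualized = Acronym(acronym="EKG",
                         left_context="im Aufnahmebefund",
                         right_context="ohne Auffälligkeit")

# Hypothetical flag standing in for `strategy == resolver.Strategy.FASTNGRAM`:
uses_context = False
if not uses_context:
    contextualized = contextualized._replace(left_context="", right_context="")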
@@ -163,14 +162,14 @@ def _find_contexts(acronym: str, min_freq: int) -> 'List[topic_list.Acronym]': continue if out_freq < min_freq: break - contextualized_acronym = acres.util.acronym.Acronym(acronym=acronym, left_context=left, + contextualized_acronym = Acronym(acronym=acronym, left_context=left, right_context=right) all_contexts.append(contextualized_acronym) return all_contexts -def _center_provider(contexts: 'List[topic_list.Acronym]', min_freq: int, +def _center_provider(contexts: 'List[Acronym]', min_freq: int, max_rank: int) -> Iterator[str]: """ Provide unlimited center words for a given list of contexts. @@ -227,7 +226,7 @@ def create_map(ngrams: Dict[str, int], model: Union[ContextMap, CenterMap], return model -def _generate_ngram_contexts(ngram: str) -> 'List[topic_list.Acronym]': +def _generate_ngram_contexts(ngram: str) -> 'List[Acronym]': """ Generate a list of contextualized n-grams with a decreasing central n-gram and increasing \ lateral context. @@ -252,12 +251,12 @@ def _generate_ngram_contexts(ngram: str) -> 'List[topic_list.Acronym]': left = sys.intern(" ".join(tokens[0:i])) right = sys.intern(" ".join(tokens[j:ngram_size])) center = sys.intern(" ".join(tokens[i:j])) - contexts.append(acres.util.acronym.Acronym(acronym=center, left_context=left, + contexts.append(Acronym(acronym=center, left_context=left, right_context=right)) return contexts -def _generate_acronym_contexts(contextualized_acronym: 'topic_list.Acronym') -> 'List[topic_list.Acronym]': +def _generate_acronym_contexts(contextualized_acronym: 'Acronym') -> 'List[Acronym]': """ Generate a list of contextualized acronyms with decreasing lateral context. @@ -279,7 +278,7 @@ def _generate_acronym_contexts(contextualized_acronym: 'topic_list.Acronym') -> if right_length > left_length: max_length += min(MAX_DIFF, right_length - left_length) - contexts = [] # type: List[topic_list.Acronym] + contexts = [] # type: List[Acronym] for j in range(max_length, -1, -1): # Left size > right size if j > right_length: @@ -293,7 +292,7 @@ def _generate_acronym_contexts(contextualized_acronym: 'topic_list.Acronym') -> continue left_context = " ".join(left[i:left_length]) right_context = " ".join(right[0:j]) - contexts.append(acres.util.acronym.Acronym(acronym=contextualized_acronym.acronym, + contexts.append(Acronym(acronym=contextualized_acronym.acronym, left_context=left_context, right_context=right_context)) return contexts diff --git a/acres/model/detection_standard.py b/acres/model/detection_standard.py index 5225979..688e05d 100644 --- a/acres/model/detection_standard.py +++ b/acres/model/detection_standard.py @@ -11,6 +11,7 @@ from acres.model import topic_list from acres.util import acronym as acro_util +from acres.util.acronym import Acronym logger = logging.getLogger(__name__) @@ -77,7 +78,7 @@ def parse_valid(filename: str) -> Set[str]: return filter_valid(parse(filename)) -def update(previous: Dict[str, bool], acronyms: List[acro_util.Acronym]) -> Dict[str, bool]: +def update(previous: Dict[str, bool], acronyms: List[Acronym]) -> Dict[str, bool]: """ Update a previous detection standard with new acronyms from a topic list, preserving order. 
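# Illustrative sketch (not part of the diff): the fastngram helpers above
# decompose an n-gram into every (left, center, right) split with a shrinking
# center and growing lateral context. A stripped-down version of the same
# enumeration, ignoring string interning and the MAX_DIFF balancing:
import itertools

def generate_contexts(ngram: str):
    tokens = ngram.split(" ")
    size = len(tokens)
    for i, j in itertools.combinations(range(size + 1), 2):  # all 0 <= i < j <= size
        yield (" ".join(tokens[:i]),    # left context
               " ".join(tokens[i:j]),   # center
               " ".join(tokens[j:]))    # right context

# list(generate_contexts("a b c")) includes ("", "a b c", ""), ("a", "b", "c"),
# ("a b", "c", "") and every other contiguous split.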
diff --git a/acres/ngram/ngrams.py b/acres/model/ngrams.py similarity index 98% rename from acres/ngram/ngrams.py rename to acres/model/ngrams.py index e2c727c..5a0dea1 100644 --- a/acres/ngram/ngrams.py +++ b/acres/model/ngrams.py @@ -18,7 +18,6 @@ class FilteredNGramStat: @todo ngramstat itself should be a generator """ - NGRAM_SEPARATOR = "\t" TOKEN_SEPARATOR = " " PRINT_INTERVAL = 1000000 diff --git a/acres/model/topic_list.py b/acres/model/topic_list.py index aea74ae..933b5c2 100644 --- a/acres/model/topic_list.py +++ b/acres/model/topic_list.py @@ -7,7 +7,7 @@ from operator import attrgetter from typing import List, Set -from acres.ngram import ngrams +from acres.model import ngrams from acres.util import acronym as acro_util from acres.util import functions diff --git a/acres/ngram/__init__.py b/acres/ngram/__init__.py deleted file mode 100644 index b784eec..0000000 --- a/acres/ngram/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -.. deprecated:: 0.1 - Use `fastngram` instead. - -Package grouping modules related to the n-gram expansion strategy as originally implemented by -Prof. Stefan Schulz. Since this implementation relies on regular expressions, it typically runs -slower than `fastngram`. -""" -from acres.ngram import finder, ngrams - -__all__ = ['finder', 'ngrams'] diff --git a/acres/ngram/finder.py b/acres/ngram/finder.py deleted file mode 100755 index 4435e38..0000000 --- a/acres/ngram/finder.py +++ /dev/null @@ -1,329 +0,0 @@ -""" -Finds synonyms using a n-gram frequency list from related corpus. - -.. codeauthor:: Stefan Schulz -""" - -import logging -import re -from collections import namedtuple -from typing import List, Tuple, Set, Pattern, AnyStr, Iterator - -from acres import constants -from acres.preprocess import resource_factory -from acres.util import functions -from acres.util import text - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -FinderConstraints = namedtuple('FinderConstraints', ['min_freq', 'max_count', 'min_num_tokens', - 'max_num_tokens']) - - -def _build_search_ngrams(context: str, reverse: bool = False) -> Tuple[str, str, str]: - """ - Builds a context tuple containing 1 to n-grams - - :param context: A string with the context - :param reverse: Takes the context in reverse (e.g. left context) - :return: A tuple with 1 to n-gram - """ - unigram = bigram = trigram = "" - if context != "": - unigram = text.context_ngram(context, 1, reverse) - bigram = text.context_ngram(context, 2, reverse) - trigram = text.context_ngram(context, 3, reverse) - return unigram, bigram, trigram - - -def _strip_frequencies(embeddings: List[Tuple[int, str]], min_freq: int = 0) -> List[str]: - """ - Strip out frequencies from a given embedding list obtained via find_embeddings. - - :param embeddings: A list of embeddings in the format freq\tembedding. - :param min_freq: Minimum frequency to be used (defaults to 0). - :return: A list of embeddings containing only strings with a minimum frequency. - """ - ret = [] - for embedding in embeddings: - (freq, ngram) = embedding - if freq >= min_freq: - ret.append(ngram) - return ret - - -def robust_find_embeddings(acronym: str, left_context: str, right_context: str) -> Iterator[str]: - """ - Generates several search patterns and returns the first found embeddings. - - @todo integrate with logic from find_synonyms()? 
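# Illustrative sketch (not part of the diff): the removed finder trimmed the
# observed context to 1-, 2- and 3-token windows (right-anchored for the left
# context) and backed off from the most specific to the least specific pattern.
# A minimal re-implementation of that trimming, mirroring text.context_ngram
# further down in this diff:
def context_window(context: str, size: int, reverse: bool = False) -> str:
    if size == 0 or not context:
        return ""
    tokens = context.split(" ")
    return " ".join(tokens[-size:]) if reverse else " ".join(tokens[:size])

left = "Patient mit akutem"
right = "und Dyspnoe aufgenommen"
left_ngrams = tuple(context_window(left, n, reverse=True) for n in (1, 2, 3))
right_ngrams = tuple(context_window(right, n) for n in (1, 2, 3))
# left_ngrams  == ("akutem", "mit akutem", "Patient mit akutem")
# right_ngrams == ("und", "und Dyspnoe", "und Dyspnoe aufgenommen")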
- - :param acronym: - :param left_context: - :param right_context: - :return: - """ - (left_unigram, left_bigram, left_trigram) = _build_search_ngrams(left_context, True) - (right_unigram, right_bigram, right_trigram) = _build_search_ngrams(right_context) - - # Order is important for the quality of the retrieved expansion - patterns = [(left_trigram, right_trigram), # trigrams - (left_bigram, right_trigram), (left_trigram, right_bigram), # bigram + trigram - (left_bigram, right_bigram), # bigrams - (left_unigram, right_bigram), (left_bigram, right_unigram), # bigram + unigram - (left_unigram, right_unigram), # unigrams - (left_bigram, ""), (left_unigram, ""), # bigram/unigram + - ("", right_bigram), ("", right_unigram), # + bigram/unigram - ("", ""), # + - ("", ""), ("", "")] # + - - finder_constraints = FinderConstraints(min_freq=1, max_count=500, min_num_tokens=2, - max_num_tokens=10) - previous_left_pattern = previous_right_pattern = "" - for pattern in patterns: - (left_pattern, right_pattern) = pattern - logger.debug("%s | %s", left_pattern, right_pattern) - - # Quick optimization: don't search for patterns that happens to be the same as last one - if left_pattern != previous_left_pattern or right_pattern != previous_right_pattern: - embeddings = find_embeddings(left_pattern, acronym, right_pattern, finder_constraints) - stripped_embeddings = _strip_frequencies(embeddings) - for emb in stripped_embeddings: - yield emb - - previous_left_pattern = left_pattern - previous_right_pattern = right_pattern - - return "" - - -def find_embeddings(str_left: str, str_middle: str, str_right: str, - finder_constraints: FinderConstraints) -> List[Tuple[int, str]]: - """ - Input str_middle, together with a series of filter parameters - Three cases of embeddings: 1. bilateral, 2.left, 3.right - - :param str_left: string left of unknown ("" if to be retrieved ; "" if empty) - :param str_middle: input nonlex form (with or without context words) for which synonym is sought - :param str_right: string right uf unknown ("" if to be retrieved ; "" if empty") - :param finder_constraints: - :return: - """ - logger.debug("Minimum n-gram frequency: %d", finder_constraints.min_freq) - logger.debug("Maximum count of iterations: %d", finder_constraints.max_count) - logger.debug("N-gram cardinality between %d and %d", - finder_constraints.min_num_tokens, finder_constraints.max_num_tokens) - - # set of selected ngrams for filtering - if str_left == "" and str_right == "": - logger.debug("No filter. 
Return empty list") - return [] - - sel_rows = _build_sel_rows(str_left, str_middle, str_right) - - regex_embed = _build_regex(str_left, str_middle, str_right) - - all_beds = _build_all_beds(sel_rows, regex_embed, finder_constraints) - - count = len(all_beds) - - # random selection of hits, to avoid explosion - sel_beds = functions.random_sub_list(all_beds, count) - - # if logger.getEffectiveLevel() == logging.DEBUG: - # _debug_embeddings(all_beds) - - if count <= 0: - return [] - - max_num = (finder_constraints.max_count // count) + 3 - logger.debug("Per matching n-gram %d surroundings", max_num) - return _find_middle(str_middle, sel_beds, max_num) - - -def _build_regex(str_left: str, str_middle: str, str_right: str) -> Pattern[AnyStr]: - """ - - :param str_left: - :param str_middle: - :param str_right: - :return: - """ - # Construction of regular expression - # Left Middle Right - # VOID "^" "$" - # Specified ("abc") "^abc\ " "abc" "\ abc$" - # SELECTED "^.*\ " "\ .*$" - # Build regular expression for matching ngram - - # Three - - str_middle_esc = re.escape(str_middle.strip()) - - if str_left == "": - str_left_esc = r"^.*\ " - elif str_left == "": - str_left_esc = "^" - else: - str_left_esc = "^" + re.escape(str_left.strip()) + r"\ " - - if str_right == "": - str_right_esc = r"\ .*$" - elif str_right == "": - str_right_esc = "$" - else: - str_right_esc = r"\ " + re.escape(str_right.strip()) + "$" - regex_embed = str_left_esc + str_middle_esc + str_right_esc - - # logger.debug("Unknown expression: '%s'", str_middle_esc) - # logger.debug("Left context: '%s'", str_left_esc) - # logger.debug("Right context: '%s'", str_right_esc) - logger.debug("Regular expression: %s", regex_embed) - - return re.compile(regex_embed, re.IGNORECASE) - - -def _build_sel_rows(str_left: str, str_middle: str, str_right: str) -> List[Tuple[int, str]]: - """ - Generate list of words for limiting the search space via word index - - :param str_left: - :param str_middle: - :param str_right: - :return: - """ - ngramstat = resource_factory.get_ngramstat() - index = resource_factory.get_index() - - sel_rows = [] # type: List[Tuple[int,str]] - all_sets = [] # type: List[Set[int]] - - str_complete = str_left.strip() + " " + str_middle.strip() + " " + str_right.strip() - all_tokens = str_complete.split(" ") - for token in all_tokens: - if token not in ("", ""): - all_sets.append(index[token]) - ngram_selection = set.intersection(*all_sets) - for ngram_id in ngram_selection: - sel_rows.append(ngramstat[ngram_id]) - - logger.debug("Number of matching ngrams by word index: %d", len(sel_rows)) - - return sel_rows - - -def _build_all_beds(sel_rows: List[Tuple[int, str]], regex_embed: Pattern[AnyStr], - finder_constraints: FinderConstraints) -> List[Tuple[int, str]]: - """ - - #max_num_tokens: int, min_num_tokens: int, minfreq: int, maxcount: int - - :param sel_rows: - :param regex_embed: - :param finder_constraints: - :return: - """ - all_beds = [] # type: List[Tuple[int,str]] - - count = 0 - - for row in sorted(sel_rows, reverse=True): # iteration through all matching ngrams - # input("press key!) 
- (freq, ngram) = row - #logger.debug("%d => %s", freq, ngram) - - ngram_card = ngram.count(" ") + 1 # cardinality of the nGram - # Filter by ngram cardinality - if finder_constraints.max_num_tokens >= ngram_card >= finder_constraints.min_num_tokens: - # watch out for multiword input str_middle - # TODO: min should be at least 1 plus cardinality of middle term - if freq >= finder_constraints.min_freq: - # might suppress low n-gram frequencies - # TODO: probably best 1, could be increased for performance - # the current ngram dump and index were created on a lower frequency bound of 2 - # This is the reason, some acronyms cannot be resolved - # recommendation: recreate - stripped_ngram = ngram.strip() - match = regex_embed.search(stripped_ngram) - # all_beds collects all contexts in which the unknown string - # is embedded. - # the length of right and left part of "bed" is only - # limited by the length of the ngram - if match is not None and row not in all_beds: - all_beds.append(row) - # logger.debug("%d: %s", freq, ngram) - count += 1 - if count >= finder_constraints.max_count: - logger.debug("List cut at %d", count) - break - - # logger.debug("COUNT: %d", count) - - return all_beds - - -def _debug_embeddings(all_beds: List[Tuple[int, str]]) -> None: - logger.debug("Embeddings:") - for (freq, ngram) in all_beds: - logger.debug("%d\t%s", freq, ngram) - logger.debug("Generated list of %d matching n-grams", len(all_beds)) - - -def _find_middle(str_middle: str, sel_beds: List[Tuple[int, str]], - max_num: int) -> List[Tuple[int, str]]: - """ - - :param str_middle: - :param sel_beds: - :param max_num: - :return: - """ - ngramstat = resource_factory.get_ngramstat() - index = resource_factory.get_index() - - out = [] # type: List[Tuple[int,str]] - - str_middle_esc = re.escape(str_middle.strip()) - - # print("----------------------------------------------") - for row in sel_beds: # iterate through extract - (freq, ngram) = row - bed = ngram.strip() - - new_sets = [] - regex_bed = "^" + re.escape(bed) + "$" - regex_bed = regex_bed.replace(str_middle_esc, "(.*)") - compiled_regex_bed = re.compile(regex_bed, re.IGNORECASE) - surroundings = bed.replace(str_middle + " ", "").split(" ") - for word in surroundings: - # logger.debug("Surrounding str_middle: %s", word) - new_sets.append(index[word]) - ngrams_with_surroundings = list(set.intersection(*new_sets)) - # logger.debug( - # "Size of list that includes surrounding elements: %d", - # len(ngrams_with_surroundings)) - ngrams_with_surroundings.sort(reverse=True) - # Surrounding list sorted - counter = 0 - for ngram_id in ngrams_with_surroundings: - if counter > max_num: - break - (freq, ngram) = ngramstat[ngram_id] - stripped_ngram = ngram.strip() - match = compiled_regex_bed.search(stripped_ngram) - if match is not None: - # logger.debug(regex_bed) - # logger.debug(row) - long_form = match.group(1).strip() - if len(long_form) > len(str_middle) and \ - constants.LINE_BREAK not in long_form and \ - constants.DIGIT_MARKER not in long_form: - # logger.debug(ngramfrecquency, long_form, " [" + ngram + "]") - counter += 1 - rec = (freq, long_form.strip()) - if rec not in out: - out.append(rec) - - out.sort(reverse=True) - return out diff --git a/acres/preprocess/dumps.py b/acres/preprocess/dumps.py index cb87037..2c918eb 100644 --- a/acres/preprocess/dumps.py +++ b/acres/preprocess/dumps.py @@ -3,70 +3,28 @@ .. 
codeauthor:: Stefan Schulz """ -import collections import logging -from typing import Dict, Set, List, Tuple, Optional +from typing import Dict, Tuple from acres import constants -from acres.preprocess import resource_factory -from acres.util import acronym from acres.util import functions from acres.util import text logger = logging.getLogger(__name__) -def create_corpus_char_stat_dump(corpus_path: str, ngramlength: int = 8) -> Dict[str, int]: - """ - Takes a corpus consisting of text files in a single directory, substitutes digits and line - breaks, and generates statistics of character ngrams including the digit and break substitutes. - - Purpose: To substitute artificial breaks in a corpus. - - :param corpus_path: - :param ngramlength: - :return: - """ - texts = functions.robust_text_import_from_dir(corpus_path) - - logger.info("Creating character ngrams from %d documents...", len(texts)) - - dict_char_ngrams = {} # type: Dict[str, int] - - for doc in texts: - str_doc = "" - lines = doc.split("\n") - for line in lines: - line = text.clear_digits(line, constants.DIGIT_MARKER) - str_doc = str_doc + line.strip() + constants.LINE_BREAK - for i in range(0, len(str_doc) - (ngramlength - 1)): - ngram = str_doc[0 + i: ngramlength + i] - if len(ngram) == ngramlength: - if ngram not in dict_char_ngrams: - dict_char_ngrams[ngram] = 1 - else: - dict_char_ngrams[ngram] += 1 - - return dict_char_ngrams - - def create_corpus_ngramstat_dump(corpus_path: str, min_freq: int, min_length: int = 1, - max_length: int = 7, fix_lines: bool = True) -> Dict[str, int]: + max_length: int = 7) -> Dict[str, int]: """ Takes a corpus consisting of text files in a single directory Substitutes digits and line breaks It requires that all documents are in UTF-8 text. - It can perform line break cleansing (removes artificial line breaks) - and substitutions of digits. - For fixing the lines, a character ngram stat dictionary, - CREATED FROM THE SAME OR A SIMILAR - CORPUS, character_ngrams.p must be in place. + It can perform substitutions of digits. :param corpus_path: :param min_freq: :param min_length: :param max_length: - :param fix_lines: :return: """ @@ -84,9 +42,6 @@ def create_corpus_ngramstat_dump(corpus_path: str, min_freq: int, min_length: in if counter % 1000 == 0: logger.debug("%d/%d", counter, length) - if fix_lines: - doc = text.fix_line_endings(doc) - # TODO normalize case if not acronym? # TODO normalize german characters: ä => ae # TODO normalize c = k (soundex?) @@ -94,7 +49,7 @@ def create_corpus_ngramstat_dump(corpus_path: str, min_freq: int, min_length: in # ("Belastungs-Dyspnoe" = "Belastungs Dyspnoe" = "Belastungsdyspnoe") # doc = text.tokenize(doc) - doc = text.clean(doc, fix_lines) + doc = text.clean(doc) doc = text.clear_digits(doc, constants.DIGIT_MARKER) @@ -152,94 +107,3 @@ def create_indexed_ngrams(ngrams: Dict[str, int]) -> Dict[int, Tuple[int, str]]: output[identifier] = (freq, ngram) identifier += 1 return output - - -def create_index(ngramstat: Dict[int, Tuple[int, str]]) -> Dict[str, Set[int]]: - """ - Create an inverted index for performance issue when retrieving ngram records. 
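# Illustrative sketch (not part of the diff): the removed character-ngram dump
# slid a fixed-length window over each document after substituting digits and
# line breaks. A minimal version of the counting loop; the "¶" line-break
# marker stands in for acres.constants.LINE_BREAK:
from collections import Counter
from typing import Dict

def char_ngram_counts(doc: str, length: int = 8) -> Dict[str, int]:
    counts = Counter()  # type: Counter
    for i in range(len(doc) - length + 1):
        counts[doc[i:i + length]] += 1
    return dict(counts)

# char_ngram_counts("Belastungsdyspnoe¶", 8) counts every 8-character window,
# including the windows that contain the line-break marker.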
- - :param ngramstat: - :return: - """ - index = collections.defaultdict(set) # type: Dict[str, Set[int]] - for identifier in ngramstat: - # XXX Think about trie data structure - # logger.debug(ngramstat[ID]) - (_, ngram) = ngramstat[identifier] - words = ngram.split(" ") - for word in words: - index[word].add(identifier) - if len(word) > 1 and not word[-1].isalpha(): - index[word[0:-1]].add(identifier) - - return index - - -def create_acro_dump() -> List[str]: - """ - Creates and dumps set of acronyms from ngram statistics. - - :return: - """ - # acronym_ngrams = resource_factory.get_acronym_ngrams() - # for i in acronym_ngrams: - # logger.debug(i) - counter = 0 - acronyms = [] # type: List[str] - - ngram_stat = resource_factory.get_ngramstat() - for entry in ngram_stat: - row = (ngram_stat[entry]) - (_, ngram) = row - if ngram.isalnum() and constants.DIGIT_MARKER not in ngram: - if acronym.is_acronym(ngram, 7): - # plausible max length for German medical language - if ngram not in acronyms: - acronyms.append(ngram) - counter += 1 - - return acronyms - - -def create_new_acro_dump() -> List[str]: - """ - - :return: - """ - - counter = 0 - new_acronym_ngrams = [] - - ngram_stat = resource_factory.get_ngramstat() - for _, freq_ngram in ngram_stat.items(): - (_, ngram) = freq_ngram - if " " in ngram: - tokens = ngram.split(" ") - for token in tokens: - if acronym.is_acronym(token, 7): - new_acronym_ngrams.append(ngram) - counter += 1 - break - - return new_acronym_ngrams - - -def create_morpho_dump(lexicon_file: str, append_to: Optional[Set] = None) -> Set[str]: - """ - Creates and dumps set of plausible English and German morphemes - from morphosaurus dictionary. - TODO: created rather quick & dirty, only for scoring acronym resolutions - - :return: - """ - append_to = append_to or set() - - with open(lexicon_file) as file: - for row in file: - if "" in row: - row = row.strip()[5:-6] - row = row.replace("z", "c").replace("k", "c") - # logger.debug(row) - append_to.add(row) - - return append_to diff --git a/acres/preprocess/resource_factory.py b/acres/preprocess/resource_factory.py index dd0a373..c036acc 100644 --- a/acres/preprocess/resource_factory.py +++ b/acres/preprocess/resource_factory.py @@ -6,12 +6,12 @@ import logging import os.path import pickle -from typing import Dict, Set, List, Tuple, Any +from typing import Dict, List, Tuple, Any from gensim.models import Word2Vec from acres.fastngram import fastngram -from acres.nn import train +from acres.word2vec import train from acres.preprocess import dumps from acres.stats import dictionary from acres.util import functions @@ -22,8 +22,7 @@ PICKLE_FOLDER = "models/pickle/" NGRAMS_FOLDER = "models/ngrams/" -LOG_FOLDER = "models/log/" -NN_MODELS_FOLDER = "models/nn/" +NN_MODELS_FOLDER = "models/word2vec/" DATA_FOLDER = functions.import_conf("CORPUS_PATH") VERSION = "V10" @@ -31,99 +30,14 @@ # minimal number of occurrences of a word ngram in the corpus MIN_FREQ = 2 -MORPHEMES = set() # type: Set[str] -INDEX = {} # type: Dict[str, Set[int]] NN_MODEL = None # type: Word2Vec NGRAMSTAT = {} # type: Dict[int, Tuple[int,str]] -CHARACTER_NGRAMS = {} # type: Dict[str, int] WORD_NGRAMS = {} # type: Dict[str, int] DICTIONARY = {} # type: Dict[str, List[str]] CONTEXT_MAP = {} # type: Dict[int, fastngram.ContextMap] CENTER_MAP = {} # type: Dict[int, fastngram.CenterMap] -def get_log_corpus_filename() -> str: - """ - Get the full path to the `logCorpus.txt` file. 
- - :return: - """ - os.makedirs(os.path.dirname(LOG_FOLDER), exist_ok=True) - return LOG_FOLDER + "logCorpus.txt" - - -def get_log_web_filename() -> str: - """ - Get the full path to the `logWebs.txt` file. - - :return: - """ - os.makedirs(os.path.dirname(LOG_FOLDER), exist_ok=True) - return LOG_FOLDER + "logWebs.txt" - - -def get_morphemes() -> Set[str]: - """ - Lazy load the set of morphemes. - - Loading order is as follows: - 1. Variable; - 2. Pickle file; - 3. Generation. - - :return: - """ - global MORPHEMES - - if not MORPHEMES: - output_file = PICKLE_FOLDER + "morphemes.p" - - if not os.path.isfile(output_file): - _log_file_not_found(output_file) - - morph_eng = functions.import_conf("MORPH_ENG") - morph_ger = functions.import_conf("MORPH_GER") - - morphemes = dumps.create_morpho_dump(morph_eng) - morphemes = dumps.create_morpho_dump(morph_ger, morphemes) - - _dump(morphemes, output_file) - - _log_file_found(output_file) - MORPHEMES = _load(output_file) - - return MORPHEMES - - -def get_index() -> Dict[str, Set[int]]: - """ - Lazy load the inverted index of ngrams. - - Loading order is as follows: - 1. Variable; - 2. Pickle file; - 3. Generation. - - :return: - """ - global INDEX - - if not INDEX: - output_file = PICKLE_FOLDER + "index-" + str(MIN_FREQ) + "-" + VERSION + ".p" - - if not os.path.isfile(output_file): - _log_file_not_found(output_file) - - ngramstat = get_ngramstat() - index = dumps.create_index(ngramstat) - _dump(index, output_file) - - _log_file_found(output_file) - INDEX = _load(output_file) - - return INDEX - - def get_word_ngrams() -> Dict[str, int]: """ Lazy load a not-indexed representation of ngrams. @@ -145,7 +59,7 @@ def get_word_ngrams() -> Dict[str, int]: _log_file_not_found(pickle_output_file) _log_file_not_found(ngram_output_file) - word_ngrams = dumps.create_corpus_ngramstat_dump(DATA_FOLDER, MIN_FREQ, fix_lines=False) + word_ngrams = dumps.create_corpus_ngramstat_dump(DATA_FOLDER, MIN_FREQ) write_txt(word_ngrams, ngram_output_file) _dump(word_ngrams, pickle_output_file) @@ -186,82 +100,6 @@ def get_ngramstat() -> Dict[int, Tuple[int, str]]: return NGRAMSTAT -def get_acronym_ngrams() -> List[str]: - """ - Lazy load a list of ngrams containing acronyms. - - Loading order is as follows: - 1. Pickle file; - 2. Generation. - - :return: - """ - output_file = PICKLE_FOLDER + "acronymNgrams.p" - - if not os.path.isfile(output_file): - _log_file_not_found(output_file) - - acronym_ngrams = dumps.create_new_acro_dump() - _dump(acronym_ngrams, output_file) - - _log_file_found(output_file) - return _load(output_file) - - -def get_acronyms() -> List[str]: - """ - Lazy load a list of acronyms. - - Loading order is as follows: - 1. Pickle file; - 2. Generation. - - :return: - """ - output_file = PICKLE_FOLDER + "acronyms.p" - - if not os.path.isfile(output_file): - _log_file_not_found(output_file) - - acronyms = dumps.create_acro_dump() - _dump(acronyms, output_file) - - _log_file_found(output_file) - return _load(output_file) - - -def get_character_ngrams() -> Dict[str, int]: - """ - Lazy load character ngrams. - - Loading order is as follows: - 1. Variable; - 2. Pickle file; - 3. Generation. 
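# Illustrative sketch (not part of the diff): every get_* accessor in
# resource_factory follows the same three-step lazy-loading scheme listed above
# (module variable, pickle file, generation). A generic, hypothetical version of
# that pattern; the names are not taken from the codebase:
import os
import pickle

_CACHE = None

def get_resource(pickle_path: str, build):
    """Return the cached resource, loading it from pickle or building it once."""
    global _CACHE
    if _CACHE is None:
        if not os.path.isfile(pickle_path):
            resource = build()                       # 3. Generation
            with open(pickle_path, "wb") as handle:
                pickle.dump(resource, handle)
        with open(pickle_path, "rb") as handle:      # 2. Pickle file
            _CACHE = pickle.load(handle)
    return _CACHE                                    # 1. Variable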
- - :return: - """ - global CHARACTER_NGRAMS - - if not CHARACTER_NGRAMS: - pickle_output_file = PICKLE_FOLDER + "character_ngrams.p" - ngram_output_file = NGRAMS_FOLDER + "character_ngrams.txt" - - if not os.path.isfile(pickle_output_file): - _log_file_not_found(pickle_output_file) - _log_file_not_found(ngram_output_file) - - character_ngrams = dumps.create_corpus_char_stat_dump(DATA_FOLDER) - - write_txt(character_ngrams, ngram_output_file) - _dump(character_ngrams, pickle_output_file) - - _log_file_found(pickle_output_file) - CHARACTER_NGRAMS = _load(pickle_output_file) - - return CHARACTER_NGRAMS - - def get_nn_model(ngram_size: int = 3, min_count: int = 1, net_size: int = 100, alpha: float = 0.025, sg: int = 0, hs: int = 0, negative: int = 5) -> Word2Vec: """ @@ -368,13 +206,10 @@ def reset() -> None: :return: """ - global MORPHEMES, INDEX, NN_MODEL, NGRAMSTAT, CHARACTER_NGRAMS + global NN_MODEL, NGRAMSTAT - MORPHEMES = set() - INDEX = {} NN_MODEL = None NGRAMSTAT = {} - CHARACTER_NGRAMS = {} def warmup_cache() -> None: @@ -383,13 +218,8 @@ def warmup_cache() -> None: :return: """ - #get_morphemes() get_word_ngrams() get_ngramstat() - get_acronym_ngrams() - get_acronyms() - get_index() - get_character_ngrams() get_nn_model() diff --git a/acres/rater/rater.py b/acres/rater/rater.py index a6eb121..3b8a73d 100755 --- a/acres/rater/rater.py +++ b/acres/rater/rater.py @@ -3,12 +3,10 @@ """ import logging -from typing import Tuple from acres.rater import expansion from acres.rater import full as full_rater from acres.util import acronym as acro_util -from acres.util import variants as varianter logger = logging.getLogger(__name__) @@ -107,68 +105,3 @@ def get_acronym_score(acro: str, full: str) -> float: return 0 return _calc_score(acro, full) - - -def get_acronym_score_variants(acro: str, full: str) -> float: - """ - Wrapper for `get_acronym_score` that takes variants into consideration. - - For checking for valid German expansions it is important to consider variants, - therefore invoke spelling variant generator from `varianter.generate_all_variants_by_rules`. - At this place more rules can be added. - - Typical substitutions, mostly concerning the inconsistent use of k, c, and z in clinical texts - can be enhanced by frequent translations in `varianter.generate_all_variants_by_rules`. - - Return the score of the best variant. - - .. deprecated:: 0.1 - Variants have not been used recently (e.g. not used in Michel's PhD Thesis). - - :param acro: - :param full: - :return: - """ - max_score = 0.0 - variants = varianter.generate_all_variants_by_rules(full) - for variant in variants: - max_score = max(max_score, get_acronym_score(acro, variant)) - return max_score - - -def get_acro_def_pair_score(acro: str, full: str) -> Tuple[str, float]: - """ - Wrapper function for `get_acronym_score` that takes possible acronym-definition pairs into - account. - - The scoring function should work both for acronyms extracted from a corpus (for which strict - matching criteria should be applied) and for acronyms harvested from the Web for which the - criteria may be relaxed once strong evidence from acronym - definition patterns exist, e.g. - "ARDS (akutes Atemnotsyndrom)". - There might be acronym - definition patterns in well-written clinical documents. - - In the latter case, full would take this form, i.e. a string that contains both the acronym and - the expansion. 
- - :param acro: - :param full: - :return: - """ - is_acronym_definition_pair = False - definition = full - - # full form contains an acronym definition pattern (normally only yielded - # from Web scraping, unlikely in clinical texts) - # acronym is included; is then removed from full form - acro_def_pattern = acro_util.extract_acronym_definition(full, 7) - if acro_def_pattern is not None: - is_acronym_definition_pair = True - if acro_def_pattern[0] == acro: - definition = acro_def_pattern[1] - # high score, but also might be something else - - # XXX Maybe we shouldn't consider variants in case it's an acronym-definition pair - score = get_acronym_score_variants(acro, definition) - if is_acronym_definition_pair: - score *= 10 - return definition, score diff --git a/acres/resolution/resolver.py b/acres/resolution/resolver.py index dfd6eb8..698460a 100644 --- a/acres/resolution/resolver.py +++ b/acres/resolution/resolver.py @@ -7,8 +7,7 @@ from typing import List, Iterator from acres.fastngram import fastngram -from acres.ngram import finder -from acres.nn import test +from acres.word2vec import test from acres.rater import rater from acres.stats import dictionary from acres.util import text @@ -18,7 +17,6 @@ class Strategy(IntEnum): """ Enum that holds acronym-solving strategies. """ - NGRAM = 1 WORD2VEC = 2 DICTIONARY = 3 FASTNGRAM = 4 @@ -59,7 +57,6 @@ def resolve(acronym: str, left_context: str, right_context: str, strategy: Strat :return: """ switcher = { - Strategy.NGRAM: finder.robust_find_embeddings, Strategy.WORD2VEC: test.find_candidates, Strategy.DICTIONARY: dictionary.expand, Strategy.FASTNGRAM: fastngram.fastngram, diff --git a/acres/stats/stats.py b/acres/stats/stats.py index 6609939..1399845 100644 --- a/acres/stats/stats.py +++ b/acres/stats/stats.py @@ -4,10 +4,8 @@ """ from typing import List -from acres import constants from acres.util import acronym from acres.util import functions -from acres.util import text as text_util class Stats: @@ -24,7 +22,6 @@ def __init__(self) -> None: self.acronym_types = 0 self.acronyms = 0 self.sentences = 0 - self.normalized_sentences = 0 def calc_stats(self, text: str) -> None: """ @@ -39,7 +36,6 @@ def calc_stats(self, text: str) -> None: self.acronym_types = Stats.count_acronyms_types(text) self.acronyms = Stats.count_acronyms(text) self.sentences = Stats.count_sentences(text) - self.normalized_sentences = Stats.count_normalized_sentences(text) @staticmethod def count_chars(text: str) -> int: @@ -113,22 +109,6 @@ def count_sentences(text: str) -> int: count += 1 return count - @staticmethod - def count_normalized_sentences(text: str) -> int: - """ - Count the number of normalized sentences in a string. - - Normalized sentences had their line endings fixed by a character n-gram model. 
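# Illustrative sketch (not part of the diff): with the NGRAM strategy removed,
# resolution in resolver.py is dispatched through a dict keyed by the Strategy
# enum, as in the `switcher` above. A minimal stand-alone version with a dummy
# candidate generator:
from enum import IntEnum

class Strategy(IntEnum):
    WORD2VEC = 2
    DICTIONARY = 3
    FASTNGRAM = 4

def dummy_candidates(acronym: str, left_context: str, right_context: str):
    yield acronym.lower()  # placeholder for test.find_candidates etc.

switcher = {Strategy.WORD2VEC: dummy_candidates,
            Strategy.DICTIONARY: dummy_candidates,
            Strategy.FASTNGRAM: dummy_candidates}

print(list(switcher[Strategy.FASTNGRAM]("EKG", "", "")))  # ['ekg']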
- - :param text: - :return: - """ - count = 0 - normalized_text = text_util.fix_line_endings(text) - for _ in normalized_text.split(constants.LINE_BREAK): - count += 1 - return count - @staticmethod def _get_acronyms(text: str) -> List[str]: acronyms = [] @@ -145,7 +125,6 @@ def __str__(self) -> str: ret.append("Acronym Types: " + str(self.acronym_types) + "\n") ret.append("Acronyms: " + str(self.acronyms) + "\n") ret.append("Sentences (raw): " + str(self.sentences) + "\n") - ret.append("Sentences (normalized): " + str(self.normalized_sentences) + "\n") return ''.join(ret) def __add__(self, other: 'Stats') -> 'Stats': @@ -155,7 +134,6 @@ def __add__(self, other: 'Stats') -> 'Stats': self.acronym_types += other.acronym_types self.acronyms += other.acronyms self.sentences += other.sentences - self.normalized_sentences += other.normalized_sentences return self def __radd__(self, other: 'Stats') -> 'Stats': diff --git a/acres/util/__init__.py b/acres/util/__init__.py index 00ec25d..2e5dc44 100644 --- a/acres/util/__init__.py +++ b/acres/util/__init__.py @@ -1,6 +1,6 @@ """ Package with general utilities modules. """ -from acres.util import acronym, functions, text, variants +from acres.util import acronym, functions, text -__all__ = ['acronym', 'functions', 'text', 'variants'] +__all__ = ['acronym', 'functions', 'text'] diff --git a/acres/util/acronym.py b/acres/util/acronym.py index 3b94dc7..6f7cd7c 100644 --- a/acres/util/acronym.py +++ b/acres/util/acronym.py @@ -2,9 +2,7 @@ Utility functions related to acronyms. """ import logging -import re from collections import namedtuple -from typing import Tuple, List, Optional from acres import constants from acres.util import text @@ -14,38 +12,6 @@ Acronym = namedtuple('Acronym', ['acronym', 'left_context', 'right_context']) -def extract_acronym_definition(str_probe: str, max_length: int, - strict: bool = False) -> Optional[Tuple[str, str]]: - """ - Identifies potential acronym / definition pairs and extract acronym and definition candidates. - A necessary criterion is that the initial characters are the same - - TODO Acronym/definition pairs normally use parentheses, but also quotes and dashes can be found - - @todo Add sibling function is_acronym_definition_pair - - :param str_probe: - :param max_length: - :param strict: - :return: - """ - str_probe = str_probe.strip() - - if len(str_probe) > 6: - if str_probe[-1] == ")" and str_probe.count("(") == 1: - left = str_probe.split("(")[0].strip() # potential definition - right = str_probe.split("(")[1][0:-1].strip() # potential acronym - if strict: - if left[0].lower() != right[0].lower(): - return None - if is_acronym(left, max_length) and not is_acronym(right, max_length): - return left, right - if is_acronym(right, max_length) and not is_acronym(left, max_length): - return right, left - - return None - - def is_acronym(str_probe: str, max_length: int = 7) -> bool: """ Identifies Acronyms, restricted by absolute length @@ -90,129 +56,6 @@ def create_german_acronym(full: str) -> str: return out -def is_proper_word(str_probe: str) -> bool: - """ - A proper word is more than a single letter. - The first character may be capitalised or not, all other characters are lower case. - It must not include digits or punctuation characters (only dashes are allowed). 
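# Illustrative sketch (not part of the diff): the removed
# extract_acronym_definition looked for exactly one parenthesised segment and
# decided which side is the short form. A compact re-implementation of that
# check; `is_acronym` is assumed to behave like acres.util.acronym.is_acronym:
from typing import Callable, Optional, Tuple

def acro_def_pair(probe: str,
                  is_acronym: Callable[[str], bool]) -> Optional[Tuple[str, str]]:
    probe = probe.strip()
    if len(probe) > 6 and probe.endswith(")") and probe.count("(") == 1:
        left, right = (part.strip(" )") for part in probe.split("("))
        if is_acronym(left) and not is_acronym(right):
            return left, right    # e.g. "ARDS (akutes Atemnotsyndrom)"
        if is_acronym(right) and not is_acronym(left):
            return right, left    # e.g. "akutes Atemnotsyndrom (ARDS)"
    return None

# acro_def_pair("akutes Atemnotsyndrom (ARDS)", is_acronym)
#   -> ("ARDS", "akutes Atemnotsyndrom")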
- - :param str_probe: - :return: - """ - str_new = str_probe.replace("-", "").replace(constants.DIGIT_MARKER, "1") - if len(str_new) < 2: - return False - if not (str_probe[0].isalpha() and str_probe[-1].isalpha() and str_new.isalpha()): - return False - if not str_new[1:].islower(): - return False - return True - - -def split_expansion(acro: str, full: str) -> List[Tuple[str, ...]]: - """ - - :param acro: - :param full: - :return: - """ - if len(acro) > 7: - logger.warning("The current code is very slow for long acronyms, this may take a while...") - - bina = [] - cleaned_full = _acronym_aware_clean_expansion(acro, full) - - # TODO recursive function instead of Regex - - # TODO obvious morpheme-based scoring does not work well with this unorthodox building patterns - - # List of binary combinations of alternative regex patterns (greedy vs. non-greedy) - regs = [] - - # Iterate over the binary representations of `i`, with "0" being replaced by the greedy - # regex "*|", and "1" by the non-greedy regex "*?|". The pipe | character is just a separator, - # later used as a split character. - for i in range(0, (2 ** (len(acro) - 1))): - # Takes the binary value of i and fills it with zeroes up to the length of acronym -1 - str_bin = str(bin(i))[2:].zfill(len(acro) - 1) - - # zfill will not drop characters. In the corner case of a single letter acronym, we should - # generate an empty string. - if len(acro) == 1: - str_bin = "" - - bina.append(str_bin.replace("0", "*|").replace("1", "*?|")) - - # Iterate over the built list of expressions, each matching the initial characters in a - # different way. Then build a list of regular expressions, e.g. for "EKG": - # ^(E.*)(K.*)(G[A-Za-z0-9 ]*$) - # ^(E.*)(K.*?)(G[A-Za-z0-9 ]*$) - # ^(E.*?)(K.*)(G[A-Za-z0-9 ]*$) - # ^(E.*?)(K.*?)(G[A-Za-z0-9 ]*$) - for expr in bina: - lst_exp = expr.split("|") - i = 0 - - # Build capturing groups over each acronym character - out = "^(" - for ex in lst_exp: - out = out + re.escape(acro[i]) + "." + ex + ")(" - i += 1 - - # TODO Use Unicode matching instead of diacritics - # TODO Merge greedy and non-greedy in a single non-capturing group? - # Remove the last 3 remaining characters --- always ".)(" (because last `ex` is empty) --- - # and replace them with a group matching valid characters (alphanumeric + whitespace + - # diacritics). - regs.append(out[0:-3] + r"[\w\s]*$)") - - result = [] # type: List[Tuple[str, ...]] - for reg in regs: - if re.search(reg, cleaned_full, re.IGNORECASE) is not None: - found = re.findall(reg, cleaned_full, re.IGNORECASE)[0] - - # Avoid duplicates - if found not in result: - result.append(found) - return result - - -def _acronym_aware_clean_expansion(acronym: str, expansion: str) -> str: - """ - Remove any symbol from the expanded form, preserving hyphens, spaces and chars from the acronym. - - :param acronym: - :param expansion: - :return: - """ - ret = "" - for char in expansion: - if char.isalnum() or char in " -" or char in acronym: - ret = ret + char - else: - ret = ret + " " - return ret.strip() - - -def split_ngram(ngram: str) -> List[Tuple[str, str, str]]: - """ - Splits a token ngram with acronym(s) into all combinations of left - acro - token. 
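# Illustrative sketch (not part of the diff): one of the patterns that the
# removed split_expansion generates for the acronym "EKG", and how it carves a
# candidate expansion into per-letter groups. The non-greedy variants of the
# same pattern enumerate alternative splits:
import re

pattern = re.compile(r"^(E.*)(K.*)(G[A-Za-z0-9 ]*$)", re.IGNORECASE)
match = pattern.search("Elektrokardiogramm")
print(match.groups())  # ('Elektro', 'kardio', 'gramm')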
- - :param ngram: - :return: - """ - out = [] - tokens = ngram.split(" ") - counter = 0 - for token in tokens: - if is_acronym(token, 7): - acronym_context = (" ".join(tokens[0:counter]), - tokens[counter], " ".join(tokens[counter + 1:])) - out.append(acronym_context) - counter += 1 - return out - - def trim_plural(acronym: str) -> str: """ Trim the german plural form out of an acronym. diff --git a/acres/util/functions.py b/acres/util/functions.py index 6a2f585..d71b798 100755 --- a/acres/util/functions.py +++ b/acres/util/functions.py @@ -7,9 +7,6 @@ from configparser import ConfigParser from typing import Dict, List, Optional, Tuple, Iterable -import requests -from requests import Response - from acres import constants logger = logging.getLogger(__name__) @@ -29,41 +26,6 @@ def import_conf(key: str) -> Optional[str]: return config['DEFAULT'][key] -def get_url(url: str, params: Optional[Dict] = None, headers: Optional[Dict] = None, - timeout: int = 2) -> Optional[Response]: - """ - Make a GET request to a given URL using proxy if necessary. - - :param url: The URL to make the request to. - :param params: GET parameters - :param headers: GET headers - :param timeout: The timeout in seconds. - :return: Object from requests.get() - """ - config = ConfigParser() - config.read("config.ini") - proxy_config = config["proxy"] - proxy_dict = None - if proxy_config["UseProxy"] == "yes": - http_proxy = proxy_config["ProxyUser"] + ":" + proxy_config["ProxyPass"] + \ - "@" + proxy_config["ProxyDomain"] + ":" + proxy_config["ProxyPort"] - https_proxy = http_proxy - ftp_proxy = http_proxy - proxy_dict = { - "http": http_proxy, - "https": https_proxy, - "ftp": ftp_proxy} - - response = None - try: - response = requests.get(url, params=params, headers=headers, timeout=timeout, - proxies=proxy_dict) - response.raise_for_status() - except requests.exceptions.RequestException as ex: - logger.critical(ex) - return response - - def create_ngram_statistics(input_string: str, n_min: int, n_max: int) -> Dict[str, int]: """ Creates a dictionary that counts each nGram in an input string. Delimiters are spaces. @@ -102,30 +64,6 @@ def create_ngram_statistics(input_string: str, n_min: int, n_max: int) -> Dict[s return output -def random_sub_list(in_list: list, max_num: int) -> list: - """ - Generates random sublist. - - :param in_list: - :param max_num: - :return: - """ - lst_out = [] - length = len(in_list) - if length <= max_num: - return in_list - counter = 0 - rnumbers = [] # type: List[int] - while counter < max_num: - rnumber = (random.randint(0, length - 1)) - if rnumber not in rnumbers: - rnumbers.append(rnumber) - counter = len(rnumbers) - for rnumber in rnumbers: - lst_out.append(in_list[rnumber]) - return lst_out - - def is_stopword(str_in: str) -> bool: """ Tests whether word is stopword, according to list. @@ -198,38 +136,6 @@ def robust_text_import_from_dir(path: str) -> List[str]: return texts -def dict_to_sorted_list(ngrams_dict: Dict[str, int]) -> List[Tuple[int, str]]: - """ - Converts a ngram dictionary to a list of tuples, ordered by decreasing frequency. - - :param ngrams_dict: - :return: - """ - output = [] - for ngram in ngrams_dict: - output.append((ngrams_dict[ngram], ngram)) - output.sort(reverse=True) - return output - - -def corpus_to_ngram_list(corpus: str, min_num_tokens: int, - max_num_tokens: int) -> List[Tuple[int, str]]: - """ - Generates a ngram list, sorted by frequency, out of a corpus. 
- - Upper bound of ngram length may be set according to acronym length - Rule of thumb: acronym length + 4, in order to safely retrieve acronym / definition - pairs. Not that also quotes, dashes and parentheses count as single tokens - - :param corpus: - :param min_num_tokens: - :param max_num_tokens: - :return: - """ - stats = create_ngram_statistics(corpus, min_num_tokens, max_num_tokens) - return dict_to_sorted_list(stats) - - def partition(word: str, partitions: int) -> int: """ Find a bucket for a given word. diff --git a/acres/util/text.py b/acres/util/text.py index 59d7b45..8e985cb 100644 --- a/acres/util/text.py +++ b/acres/util/text.py @@ -2,75 +2,8 @@ Utility functions related to text processing. """ import re -import string from acres import constants -from acres.preprocess import resource_factory - - -def fix_line_endings(long_text: str, char_ngram_length: int = 8, - line_break_marker_position: int = 3) -> str: - """ - Addresses the problem that many texts come with artificial line breaks. - - These breaks are removed if distributional data show that an unbroken continuation of the text - is more likely than the break. - - :param long_text: - :param char_ngram_length: - :param line_break_marker_position: - :return: - """ - char_ngram_dict = resource_factory.get_character_ngrams() - - line_break_marker = constants.LINE_BREAK - - out = "" - long_text = long_text.strip().replace("\n", line_break_marker) - i = 0 - while i + char_ngram_length < len(long_text): - char = long_text[i] - ngr = long_text[i:i + char_ngram_length] - - # line break marker at nth position - if ngr[line_break_marker_position] == line_break_marker: - ngr_clean = clear_digits(ngr, constants.DIGIT_MARKER) - ngr_clean_space = ngr_clean.replace(line_break_marker, " ") - if ngr_clean in char_ngram_dict: - n_breaks = char_ngram_dict[ngr_clean] - else: - n_breaks = 0 - if ngr_clean_space in char_ngram_dict: - n_spaces = char_ngram_dict[ngr_clean_space] - else: - n_spaces = 0 - # logger.debug("----") - # logger.debug(ngr) - # logger.debug("With new line: %s", n_breaks) - # logger.debug("With space: %s", n_spaces) - if n_spaces > n_breaks: - # TODO: line_break_marker as delimiter - # What happens if the break marker symbol also occurs in the original text - # probably safe: using the "¶" character for line breaks - # Check for whole code how delimiters are handled and how this - # might interfere with text processing - out = out + ngr.replace(line_break_marker, " ") - i = i + char_ngram_length - if i >= len(long_text): - break - else: - out = out + char - i = i + 1 - if i == len(long_text): - break - else: - out = out + char - i = i + 1 - if i == len(long_text): - break - - out = out + long_text[i:] + line_break_marker - return out def clear_digits(str_in: str, substitute_char: str) -> str: @@ -93,56 +26,6 @@ def clear_digits(str_in: str, substitute_char: str) -> str: return out -def transliterate_to_seven_bit(str_in: str) -> str: - """ - Converts string to 7-bit ASCII, considering language - specific rules, - such as in German "Ä" -> "AE", in English "Ä" -> "A" - Considering in-built capitalization rules such as "ß" -> "SS" - TODO: completing transliteration rules when non-Western languages are used - consider using unidecode - - :param str_in: - :return: - """ - substitutions = { - "À": "A", - "Á": "A", - "Â": "A", - "Ã": "A", - "Ä": "A", - "Å": "A", - "Æ": "AE", - "Ç": "C", - "È": "E", - "É": "E", - "Ê": "E", - "Ë": "E", - "Ì": "I", - "Í": "I", - "Î": "I", - "Ï": "I", - "Ñ": "N", - "Ò": "O", - "Ó": "O", - "Ô": "O", 
- "Õ": "O", - "Ö": "O", - "Ø": "O", - "Ù": "U", - "Ú": "U", - "Û": "U", - "Ü": "U"} - - if constants.LANGUAGE == "de": - substitutions["Ä"] = "AE" - substitutions["Å"] = "AA" - substitutions["Ö"] = "OE" - substitutions["Ø"] = "OE" - substitutions["Ü"] = "UE" - - return "".join([substitutions.get(c, c) for c in str_in.upper()]) - - def reduce_repeated_chars(str_in: str, char: str, remaining_chars: int) -> str: """ :param str_in: text to be cleaned @@ -164,36 +47,6 @@ def reduce_repeated_chars(str_in: str, char: str, remaining_chars: int) -> str: return out -def replace_punctuation(punctuated: str) -> str: - """ - Replaces punctuation marks (as defined by pyhton string collection) by a whitespace. - - :param punctuated: Punctuated string. - :return: A non-punctuated string. - """ - _punctuation = set(string.punctuation) - for punct in set(punctuated).intersection(_punctuation): - punctuated = punctuated.replace(punct, ' ') - return ' '.join(punctuated.split()) - - -def context_ngram(words: str, size: int, reverse: bool = False) -> str: - """ - Reduces a given sentence to `size` words, to be used as a context n-gram. - - If `reverse` is `True`, the last `size` words are used, commonly employed as a left context. - - :param words: - :param size: - :param reverse: - :return: - """ - if size == 0: - return "" - tokens = words.split(" ") - return " ".join(tokens[-size:]) if reverse else " ".join(tokens[:size]) - - def remove_duplicated_whitespaces(whitespaced: str) -> str: """ Clean up an input string out of any number of repeated whitespaces. diff --git a/acres/util/variants.py b/acres/util/variants.py deleted file mode 100644 index d62c2ae..0000000 --- a/acres/util/variants.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -Module to generate string variants. - -.. deprecated:: 0.1 - Variants have not been used recently (e.g. not used in Michel's PhD Thesis). -""" -from typing import List, Union, Tuple - - -def _resolve_ambiguous_lists(lists: List[List[Union[str, Tuple[str, str]]]]) -> \ - List[List[Union[str, Tuple[str, str]]]]: - """ - - :param lists: - :return: - """ - for a_list in lists: - list0 = [] # type: List[Union[str, Tuple[str, str]]] - list1 = [] # type: List[Union[str, Tuple[str, str]]] - is_open = True - is_tuple = False - for element in a_list: - if isinstance(element, tuple) and is_open: - list0.append(element[0]) - list1.append(element[1]) - is_open = False - is_tuple = True - else: - list0.append(element) - list1.append(element) - if is_tuple: - lists.append(list0) - lists.append(list1) - else: - return lists - - return [[]] - - -def _create_string_variants_as_list(a_string: str, search: str, replace: str) -> \ - List[Union[str, Tuple[str, str]]]: - """ - Analyses a string a_string for all substrings. 
- - :param a_string: - :param search: - :param replace: - :return: A list constituted by non-substitutable strings and/or search/replace pairs - """ - if search == "": - return [a_string] - ret = [] # type: List[Union[str, Tuple[str, str]]] - i = 0 - built_string = "" - while True: - char = a_string[i] - j = i + len(search) - if a_string[i:j] == search: - if built_string != "": - ret.append(built_string) - built_string = "" - ret.append((search, replace)) - i = i + len(search) - else: - built_string = built_string + char - i = i + 1 - if i >= len(a_string): - if built_string != "": - ret.append(built_string) - return ret - - -def _list_to_string(a_list: List[Union[str, Tuple[str, str]]]) -> str: - """ - transforms input of list - if a list element is not a string: -> empty string - - :param a_list: - :return: - """ - out = "" - for element in a_list: - if isinstance(element, str): - out = out + element - else: - return "" - return out - - -def _list_all_string_variants(a_string: str, search: str, replace: str) -> List[str]: - """ - - :param a_string: - :param search: - :param replace: - :return: - """ - out = [] - # XXX Why do we need to encapsulate the return of _create_string_variants_as_list in a list? - a_list = _resolve_ambiguous_lists([_create_string_variants_as_list(a_string, search, replace)]) - for element in a_list: - a_string = _list_to_string(element) - if a_string != "": - out.append(a_string) - return out - - -def generate_all_variants_by_rules(raw_string: str) -> List[str]: - """ - - :param raw_string: - :return: - """ - rules = [ - ("druck", " pressure"), - ("krankheit", " Disorder"), - ("fa", "pha"), ("Fa", "Pha"), - ("fe", "phe"), ("Fe", "Phe"), - ("fi", "phi"), ("Fi", "Phi"), - ("fo", "pho"), ("Fo", "Pho"), - ("fu", "phu"), ("Fu", "Phu"), - ("fy", "phy"), ("Fy", "Phy"), - ("fä", "phä"), ("Fä", "Phä"), - ("fö", "phö"), ("Fö", "Phö"), - ("fü", "phü"), ("Fü", "Phü"), - ("ka", "ca"), ("Ka", "Ca"), - ("ko", "co"), ("Ko", "Co"), - ("ku", "cu"), ("Ku", "Cu"), - ("zy", "cy"), ("Zy", "Cy"), - ("zi", "ci"), ("Zi", "Ci"), - ("ze", "ce"), ("Ze", "Ce"), - ("kl", "cl"), ("Kl", "Cl"), - ("kr", "cr"), ("Kr", "Cr"), - ("kn", "cn"), ("Kn", "Cn"), - ("kz", "cc"), - # TODO remove. Use `transliterate_to_seven_bit` on input first - ("ö", "e"), ("Ö", "E"), # because of esophagus - ("ü", "ue"), ("Ü", "Ue"), - ("ä", "ae"), ("Ä", "Ae")] - - out = [raw_string] - - for rule in rules: - for a_string in out: - new_list = _list_all_string_variants(a_string, rule[0], rule[1]) - for element in new_list: - if element not in out: - out.append(element) - return out diff --git a/acres/web/__init__.py b/acres/web/__init__.py deleted file mode 100644 index 62cd4e5..0000000 --- a/acres/web/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -.. deprecated:: 0.1 - Web-based expansion has not been used recently (e.g. not used in Michel's PhD Thesis) because it - is not reproducible. Consider using a fixed corpus instead or an ensemble model with a fixed - sense inventory. - -Package grouping modules related to the web expansion strategy. It contains two implementations of -web-based resolution, namely based on the Azure API (`azure`) and on parsing Bing's SERP (`bing`). -""" -from acres.web import azure, base, bing - -__all__ = ['azure', 'base', 'bing'] diff --git a/acres/web/azure.py b/acres/web/azure.py deleted file mode 100644 index 871352c..0000000 --- a/acres/web/azure.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Module for querying the Bing Web Search API v7. 
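# Illustrative sketch (not part of the diff): the removed variant generator
# repeatedly applies search/replace pairs such as ("Ka", "Ca") or ("zi", "ci")
# and keeps every distinct spelling it produces. A much simpler illustration of
# the idea; the real code also resolves ambiguous, overlapping replacements:
from typing import Iterable, List, Tuple

def naive_variants(word: str, rules: Iterable[Tuple[str, str]]) -> List[str]:
    variants = [word]
    for search, replace in rules:
        for variant in list(variants):
            candidate = variant.replace(search, replace)
            if candidate not in variants:
                variants.append(candidate)
    return variants

print(naive_variants("Karzinom", [("Ka", "Ca"), ("zi", "ci")]))
# ['Karzinom', 'Carzinom', 'Karcinom', 'Carcinom']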
-""" -import logging -import os.path -import pickle -from collections import namedtuple -from typing import List, Tuple, Dict, Any, Optional, MutableMapping - -from acres.util import functions -from acres.util import text - -logger = logging.getLogger(__name__) - -WebResult = namedtuple('WebResult', ['name', 'url', 'language', 'snippet']) - -# TODO prune cache automatically -RESULTS_CACHE = {} # type: Dict[str, Optional[List[WebResult]]] - - -def get_web_corpus(query: str) -> str: - """ - Generates a pseudo-corpus out of the web results for a given query. - - :param query: - :return: A string containing all titles and snippets for 50 web results for the query. - """ - web_results = cached_get_web_results(query) - if not web_results: - return "" - - output = [] - for result in web_results: - output.append(result.name) - output.append(result.snippet) - return ' '.join(output) - - -def cached_get_web_results(query: str) -> Optional[List[WebResult]]: - """ - Cached version of `get_web_results`. - - :param query: - :return: - """ - global RESULTS_CACHE - - if not RESULTS_CACHE: - _load_cache() - - # Trailing whitespaces make no difference - cleaned_query = text.clean_whitespaces(query) - - if cleaned_query not in RESULTS_CACHE: - web_results = get_web_results(cleaned_query) - RESULTS_CACHE[cleaned_query] = web_results - _persist_cache() - - return RESULTS_CACHE[cleaned_query] - - -def _load_cache() -> None: - global RESULTS_CACHE - - if os.path.isfile(_get_cache_name()): - RESULTS_CACHE = pickle.load(open(_get_cache_name(), "rb")) - - -def _persist_cache() -> None: - pickle.dump(RESULTS_CACHE, open(_get_cache_name(), "wb")) - - -def _get_cache_name() -> str: - return "cache/azure.p" - - -def get_web_results(query: str) -> Optional[List[WebResult]]: - """ - Queries Bing using a given term and returns a list of WebResults. - - If nothing is found, returns `None`. - - When possible, prefer cached_get_web_results, which uses a cache of results. - - :param query: - :return: - """ - - headers, response = __query(query) - logger.debug(headers) - logger.debug(response) - # logger.debug(json.dumps(response, indent=4)) - - if "webPages" not in response: - return None - - # 'id': 'https://api.cognitive.microsoft.com/api/v7/#WebPages.0', - # 'name': 'Elektrokardiogramm – Wikipedia', - # 'url': 'https://de.wikipedia.org/wiki/Elektrokardiogramm', - # 'about': [{'name': 'Electrocardiography'}], - # 'isFamilyFriendly': True, - # 'displayUrl': 'https://de.wikipedia.org/wiki/Elektrokardiogramm', - # 'snippet': 'Das Elektrokardiogramm (EKG) (zu altgriechisch καρδία kardía, deutsch ‚Herz‘, und - # γράμμα grámma, deutsch ‚Geschriebenes‘) ist die Aufzeichnung ...', - # 'dateLastCrawled': '2018-08-18T10:10:00.0000000Z', - # 'language': 'de', - # 'isNavigational': False - results = [] - for value in response["webPages"]["value"]: - web_result = WebResult(name=value['name'], url=value['url'], language=value['language'], - snippet=value['snippet']) - results.append(web_result) - - return results - - -def __query(query: str) -> Tuple[MutableMapping[str, str], Any]: - """ - Queries Bing using a given term and returns a JSON representation of the results. - Requires a valid `BingSearchApiKey` set on config.ini. - - :param query: A query term. - :return: A tuple containining response headers and a JSON representation of the web results. - """ - assert is_valid_key() - subscription_key = functions.import_conf("BingSearchApiKey") - - logger.warning("Querying Bing... 
This API call will be charged.") - - search_url = "https://api.cognitive.microsoft.com/bing/v7.0/search" - - headers = {"Ocp-Apim-Subscription-Key": subscription_key} - params = {"q": query, - "count": 50, # max: 50 - "mkt": "de-AT", - "responseFilter": "Webpages"} - response = functions.get_url(search_url, headers=headers, params=params) - return response.headers, response.json() - - -def is_valid_key() -> bool: - """ - Checks if the Bing key is valid. - - :return: - """ - key = functions.import_conf("BingSearchApiKey") - return isinstance(key, str) and len(key) == 32 diff --git a/acres/web/base.py b/acres/web/base.py deleted file mode 100644 index 43d987f..0000000 --- a/acres/web/base.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -Base module for web-based acronym resolution. -""" - -import logging -from typing import List, Tuple - -from acres import constants -from acres.rater import rater -from acres.util import functions -from acres.util import text -from acres.web import azure, bing - -logger = logging.getLogger(__name__) - -# Enables logging for under the hood libraries -logging.getLogger("requests").setLevel(logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.DEBUG) - - -def get_best_acronym_web_resolution(left: str, acro: str, right: str, minimum_len: int, - maximum_word_count: int) -> Tuple[str, float]: - """ - This is the main file to be used to leverage Bing search for resolving acronyms - - @todo call find_embeddings on the web corpus instead? - - :param left: left context of acronym to be expanded (any length) - :param acro: acronym to be expanded - :param right: right context of acronym to be expanded (any length) - :param minimum_len: the minimum length of the context words to be considered (e.g. to exclude \ - short articles etc.) 
- :param maximum_word_count: the maximum of context words that are put into the query - :return: best expansion of acronym, rating - """ - ngrams = get_web_dump_from_acro_with_context(left, acro, right, minimum_len, maximum_word_count) - - old_weight = 0.0 - weight = 0.0 - out = "" - - for (freq, ngram) in ngrams: - (full, score) = rater.get_acro_def_pair_score(acro, ngram) - if score > 0.0: - logger.debug("%.2f %s", score, full) - weight = freq * score - if weight > old_weight: - out = full - old_weight = weight - - return out, weight - - -def get_web_dump_from_acro_with_context(left: str, acro: str, right: str, min_word_len: int, - n_context: int, max_tokens_in_ngram: int = 8) \ - -> List[Tuple[int, str]]: - """ - This routine throws acronyms with left and right context (like in Excel table) to Bing and - generates an n-gram statistic - - :param acro: acronym - :param left: left context - :param right: right context - :param: min_word_len: minimal length of a context word - :return: token ngram list with possible acronym expansion - """ - - cleaned_left_context = [] - cleaned_right_context = [] - proper_context = [] - # reduce right and left context to words of minimal length min_word_len - # writing into the same tuple, alternating - left = text.replace_punctuation(left) - right = text.replace_punctuation(right) - left_context = left.split(" ") - # left_context = left_context.reverse() - right_context = right.split(" ") - for word in reversed(left_context): - if len(word) >= min_word_len: - if not (constants.DIGIT_MARKER in word or constants.LINE_BREAK in word): - cleaned_left_context.append(word) - for word in right_context: - if len(word) >= min_word_len: - if not (constants.DIGIT_MARKER in word or constants.LINE_BREAK in word): - cleaned_right_context.append(word) - i = 0 - while True: - if i < len(cleaned_left_context): - proper_context.append(cleaned_left_context[i]) - if i < len(cleaned_right_context): - proper_context.append(cleaned_right_context[i]) - i = i + 1 - if i >= len(cleaned_left_context) and i >= len(cleaned_right_context): - break - # now we have a list with the context words starting with the ones closest to the acronym - # in Bing the order of tokens in a query matters. Therefore the query must start with the - # acronym - query_tokens = [acro] + proper_context[:n_context] - query = " ".join(query_tokens) - return ngrams_web_dump(query, 1, max_tokens_in_ngram) - - -def ngrams_web_dump(query: str, min_num_tokens: int, max_num_tokens: int) -> List[Tuple[int, str]]: - """ - - :param query: - :param min_num_tokens: - :param max_num_tokens: - :return: - """ - if azure.is_valid_key(): - corpus = azure.get_web_corpus(query) - else: - corpus = bing.get_web_corpus(query) - return functions.corpus_to_ngram_list(corpus, min_num_tokens, max_num_tokens) diff --git a/acres/web/bing.py b/acres/web/bing.py deleted file mode 100644 index 7fb9d4a..0000000 --- a/acres/web/bing.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Module for querying Bing directly and parsing the SERP. - -.. codeauthor:: Stefan Schulz -""" -import logging -import time -import random -from typing import List, Tuple - -import html2text - -from acres.util import functions - -logger = logging.getLogger(__name__) - - -def get_web_corpus(query: str) -> str: - """ - Manually queries Bing for a given query to obtain a web corpus from the first return page. - - Should be used carefully, with delay. 
- - :param query: - :return: - """ - query = query.replace(" ", " ") - - # Escape HTML - query = query.replace(" ", "+") - query = query.replace("\"", "%22") - - time.sleep(random.randint(0, 2000) / 1000) - return get_url_corpus("http://www.bing.de/search?cc=de&q=" + query) - - -def get_url_corpus(url: str) -> str: - """ - Generates a pseudo-corpus out of a given URL. - - :param url: - :return: - """ - logger.info("Sending HTTP request to %s...", url) - response = functions.get_url(url) - if not response: - logger.warning("Got empty response from %s.", url) - return "" - - response_text = response.text - #logger.debug(response_text) - # - # html2text removes diacritics, therefore substitutions! - # - response_text = response_text.replace("Ä", "Ä").replace("ä", "ä") \ - .replace("Ö", "Ö").replace("ö", "ö").replace("ß", "ß") \ - .replace("Ü", "Ü").replace("ü", "ü").replace(""", 'QUOTQUOT') - txt = html2text.html2text(response_text) - # - # segmentation of text into smaller chunks; thus obtaining - # more concise ngram lists - # also detaching parentheses and quotes from enclosed text - # - txt = txt.replace("\n", " ").replace("*", "\n").replace('"', ' " ').replace('QUOTQUOT', ' " ')\ - .replace("[", "\n").replace("]", "\n").replace(")", " ) ").replace("!", "\n")\ - .replace("(", " ( ").replace(", ", " , ").replace(". ", "\n").replace("#", "\n")\ - .replace(";", "\n").replace("?", "\n").replace(": ", "\n").replace("|", "\n")\ - .replace("..", "\n").replace(" ", " ").replace(" ", " ").replace(" ", " ")\ - .replace(" ", " ").replace(" ( ) ", " ") - out = "" - # logger.debug(txt) - words = txt.split(" ") - for word in words: - word = word.strip() - if len(word) < 50: - if not ('\\' in word or '/' in word or '=' in word - or "www." in word or "%" in word): - out = out + " " + word - #logger.debug(out) - return out - - -def ngrams_url_dump(url: str, min_num_tokens: int, max_num_tokens: int) -> List[Tuple[int, str]]: - """ - Produces n-gram statistics from a given URL. - - If querying Bing, prefer ngrams_web_dump, which uses the Bing API if available. - - :param url: - :param min_num_tokens: - :param max_num_tokens: - :return: - """ - corpus = get_url_corpus(url) - return functions.corpus_to_ngram_list(corpus, min_num_tokens, max_num_tokens) diff --git a/acres/nn/__init__.py b/acres/word2vec/__init__.py similarity index 73% rename from acres/nn/__init__.py rename to acres/word2vec/__init__.py index badf214..483e710 100644 --- a/acres/nn/__init__.py +++ b/acres/word2vec/__init__.py @@ -1,6 +1,6 @@ """ Package grouping modules related to the word2vec expansion strategy. 
""" -from acres.nn import test, train +from acres.word2vec import test, train __all__ = ['test', 'train'] diff --git a/acres/nn/test.py b/acres/word2vec/test.py similarity index 100% rename from acres/nn/test.py rename to acres/word2vec/test.py diff --git a/acres/nn/train.py b/acres/word2vec/train.py similarity index 98% rename from acres/nn/train.py rename to acres/word2vec/train.py index c456926..12e7403 100644 --- a/acres/nn/train.py +++ b/acres/word2vec/train.py @@ -11,7 +11,7 @@ # from gensim.models import FastText from gensim.models.phrases import Phraser -from acres.ngram import ngrams +from acres.model import ngrams logger = logging.getLogger(__name__) diff --git a/config.ini.default b/config.ini.default index 09cb97b..7dcbf4e 100644 --- a/config.ini.default +++ b/config.ini.default @@ -1,18 +1,6 @@ [DEFAULT] CORPUS_PATH = tests/data -MORPH_ENG = tests/resources/lex.xml -MORPH_GER = tests/resources/lex.xml - -# Free trial at https://azure.microsoft.com/en-us/try/cognitive-services/?api=bing-web-search-api -BingSearchApiKey = %(BING_SEARCH_API_KEY)s # Number of partitions to split the fastNgram maps into. # A higher number of partitions is recommended for low-memory scenarios (e.g. 5 for 8 GB RAM). -FastNgramPartitions = 5 - -[proxy] -UseProxy = no -ProxyUser = DOMAIN\username -ProxyPass = secret -ProxyDomain = example.com -ProxyPort = 8080 \ No newline at end of file +FastNgramPartitions = 5 \ No newline at end of file diff --git a/docs/acres.evaluation.rst b/docs/acres.evaluation.rst index f807b47..aba6546 100644 --- a/docs/acres.evaluation.rst +++ b/docs/acres.evaluation.rst @@ -9,14 +9,6 @@ acres.evaluation package Submodules ---------- -acres.evaluation.corpus module ------------------------------- - -.. automodule:: acres.evaluation.corpus - :members: - :undoc-members: - :show-inheritance: - acres.evaluation.evaluation module ---------------------------------- diff --git a/docs/acres.fastngram.rst b/docs/acres.fastngram.rst new file mode 100644 index 0000000..644a0c5 --- /dev/null +++ b/docs/acres.fastngram.rst @@ -0,0 +1,19 @@ +acres.fastngram package +======================= + +.. automodule:: acres.fastngram + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +acres.fastngram.fastngram module +-------------------------------- + +.. automodule:: acres.fastngram.fastngram + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/acres.model.rst b/docs/acres.model.rst new file mode 100644 index 0000000..29ba232 --- /dev/null +++ b/docs/acres.model.rst @@ -0,0 +1,43 @@ +acres.model package +=================== + +.. automodule:: acres.model + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +acres.model.detection\_standard module +-------------------------------------- + +.. automodule:: acres.model.detection_standard + :members: + :undoc-members: + :show-inheritance: + +acres.model.expansion\_standard module +-------------------------------------- + +.. automodule:: acres.model.expansion_standard + :members: + :undoc-members: + :show-inheritance: + +acres.model.ngrams module +------------------------- + +.. automodule:: acres.model.ngrams + :members: + :undoc-members: + :show-inheritance: + +acres.model.topic\_list module +------------------------------ + +.. 
automodule:: acres.model.topic_list + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/acres.ngram.rst b/docs/acres.ngram.rst deleted file mode 100644 index 6ea577a..0000000 --- a/docs/acres.ngram.rst +++ /dev/null @@ -1,27 +0,0 @@ -acres.ngram package -=================== - -.. automodule:: acres.ngram - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -acres.ngram.finder module -------------------------- - -.. automodule:: acres.ngram.finder - :members: - :undoc-members: - :show-inheritance: - -acres.ngram.ngrams module -------------------------- - -.. automodule:: acres.ngram.ngrams - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/acres.nn.rst b/docs/acres.nn.rst deleted file mode 100644 index dd8cf11..0000000 --- a/docs/acres.nn.rst +++ /dev/null @@ -1,27 +0,0 @@ -acres.nn package -================ - -.. automodule:: acres.nn - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -acres.nn.test module --------------------- - -.. automodule:: acres.nn.test - :members: - :undoc-members: - :show-inheritance: - -acres.nn.train module ---------------------- - -.. automodule:: acres.nn.train - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/acres.resolution.rst b/docs/acres.resolution.rst new file mode 100644 index 0000000..fd4d78b --- /dev/null +++ b/docs/acres.resolution.rst @@ -0,0 +1,19 @@ +acres.resolution package +======================== + +.. automodule:: acres.resolution + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +acres.resolution.resolver module +-------------------------------- + +.. automodule:: acres.resolution.resolver + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/acres.rst b/docs/acres.rst index 546f60d..6957cac 100644 --- a/docs/acres.rst +++ b/docs/acres.rst @@ -12,15 +12,14 @@ Subpackages .. toctree:: acres.evaluation + acres.fastngram acres.model - acres.ngram - acres.nn acres.preprocess acres.rater acres.resolution acres.stats acres.util - acres.web + acres.word2vec Submodules ---------- diff --git a/docs/acres.util.rst b/docs/acres.util.rst index 6a09b6f..41defc5 100644 --- a/docs/acres.util.rst +++ b/docs/acres.util.rst @@ -33,11 +33,3 @@ acres.util.text module :undoc-members: :show-inheritance: -acres.util.variants module --------------------------- - -.. automodule:: acres.util.variants - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/acres.web.rst b/docs/acres.web.rst deleted file mode 100644 index 01df4fd..0000000 --- a/docs/acres.web.rst +++ /dev/null @@ -1,35 +0,0 @@ -acres.web package -================= - -.. automodule:: acres.web - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -acres.web.azure module ----------------------- - -.. automodule:: acres.web.azure - :members: - :undoc-members: - :show-inheritance: - -acres.web.base module ---------------------- - -.. automodule:: acres.web.base - :members: - :undoc-members: - :show-inheritance: - -acres.web.bing module ---------------------- - -.. automodule:: acres.web.bing - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/acres.word2vec.rst b/docs/acres.word2vec.rst new file mode 100644 index 0000000..57a9484 --- /dev/null +++ b/docs/acres.word2vec.rst @@ -0,0 +1,27 @@ +acres.word2vec package +====================== + +.. 
automodule:: acres.word2vec + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +acres.word2vec.test module +-------------------------- + +.. automodule:: acres.word2vec.test + :members: + :undoc-members: + :show-inheritance: + +acres.word2vec.train module +--------------------------- + +.. automodule:: acres.word2vec.train + :members: + :undoc-members: + :show-inheritance: + diff --git a/models/log/empty b/models/log/empty deleted file mode 100644 index e69de29..0000000 diff --git a/models/nn/empty b/models/nn/empty deleted file mode 100644 index e69de29..0000000 diff --git a/setup.py b/setup.py index 24de8df..b72cfc6 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='acres', - version='0.1', + version='1.0', description='Acronym resolution', url='https://github.com/bst-mug/acres', author='Stefan Schulz', diff --git a/tests/benchmark/conftest.py b/tests/benchmark/conftest.py index ac75a69..94a5b5d 100644 --- a/tests/benchmark/conftest.py +++ b/tests/benchmark/conftest.py @@ -8,8 +8,7 @@ def path_resource_factory(): resource_factory.PICKLE_FOLDER = "models/pickle/" resource_factory.NGRAMS_FOLDER = "models/ngrams/" - resource_factory.LOG_FOLDER = "models/log/" - resource_factory.NN_MODELS_FOLDER = "models/nn/" + resource_factory.NN_MODELS_FOLDER = "models/word2vec/" resource_factory.DATA_FOLDER = functions.import_conf("CORPUS_PATH") resource_factory.reset() print("WARNING: Switched to real data.") diff --git a/tests/benchmark/test_benchmark.py b/tests/benchmark/test_benchmark.py index 0341fd5..f67a9b4 100644 --- a/tests/benchmark/test_benchmark.py +++ b/tests/benchmark/test_benchmark.py @@ -1,151 +1,14 @@ import pytest -from acres import constants -from acres.evaluation import evaluation -from acres.ngram import finder +from acres.evaluation import evaluation, metrics from acres.preprocess import resource_factory from acres.resolution import resolver -from acres.util import text def test_fixture(): assert "tests" not in resource_factory.PICKLE_FOLDER -def test_fix_line_endings(): - doc = """* Anamnese und klin. Symptomatik - -Die Vorstellung des Pat. in der EBA für Innere Medizin -erfolgte aufgrund einer schmerzhaften Schwellung des li. -Oberschenkels. Eine Eigenanamnese ist mit dem Pat. nicht erhebbar. Laut -telef. Rücksprache mit dem Heim ist kein Sturz erhebbar. - -* Physikalischer Status - -Temp 36,3°, RR initial 198/90 nach 2 Hb Nitro 165,82 -C: nc, arrh, leise -P iL bei < Compliance : VA -Abd.palp. unauff, lieg. PEG""" - - expected = """* Anamnese und klin. Symptomatik - -Die Vorstellung des Pat. in der EBA für Innere Medizin erfolgte aufgrund einer schmerzhaften Schwellung des li. Oberschenkels. Eine Eigenanamnese ist mit dem Pat. nicht erhebbar. Laut telef. Rücksprache mit dem Heim ist kein Sturz erhebbar. - -* Physikalischer Status - -Temp 36,3°, RR initial 198/90 nach 2 Hb Nitro 165,82 -C: nc, arrh, leise -P iL bei < Compliance : VA -Abd.palp. unauff, lieg. 
PEG -""" - - # Save and restore line_break - old_line_break = constants.LINE_BREAK - constants.LINE_BREAK = "\n" - - actual = text.fix_line_endings(doc) - - constants.LINE_BREAK = old_line_break - - assert expected == actual - - -def test_find_embeddings(): - finder_constraints = finder.FinderConstraints(min_freq=2, max_count=500, min_num_tokens=1, - max_num_tokens=10) - actual = finder.find_embeddings("nach", "ICD", "Implantation", finder_constraints) - expected = [(273, 'CRT ICD'), (221, 'prophylaktischer CRT ICD'), (257, 'prophylaktischer ICD')] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("", "HF-Anstieg", "von", finder_constraints) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("", "HT", "rein", finder_constraints) - expected = [] - assert set(expected).issubset(actual) - - # viele Treffer, die mit Ht anfangen - actual = finder.find_embeddings("geplanten", "EPU", "*", finder_constraints) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("einem", "EDP", "von", finder_constraints) - expected = [(1741, 'max Gradienten'), (710, 'mittleren Gradienten'), (325, 'LVEDD')] - assert set(expected).issubset(actual) - - # wird nicht gefunden - stricter_max_count = finder.FinderConstraints(min_freq=2, max_count=100, min_num_tokens=1, - max_num_tokens=10) - actual = finder.find_embeddings("gutem", "AZ", "nach", stricter_max_count) - expected = [(312, 'AZ und mit blander Punktionsstelle'), (277, 'AZ wieder'), - (141, 'AZ und bei blander Punktionsstelle')] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("die", "VCS.", "", finder_constraints) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("", "DG's", "", finder_constraints) - expected = [] - assert set(expected).issubset(actual) - - # only without restricted context the resolution is found - # only DG's resolved, not DGs - actual = finder.find_embeddings("die", "VCS.", "", finder_constraints) - expected = [] - assert set(expected).issubset(actual) - - # only works with final dot! 
- actual = finder.find_embeddings("re", "OL", "", finder_constraints) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("die", "VCS", "", - finder.FinderConstraints(min_freq=1, max_count=100, - min_num_tokens=1, max_num_tokens=10)) - expected = [] - assert set(expected).issubset(actual) - - # Code originally commented out below # - - actual = finder.find_embeddings("", "morph.", "", - finder.FinderConstraints(min_freq=3, max_count=1000, - min_num_tokens=1, max_num_tokens=7)) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("Mitralklappe", "morph.", "*", - finder.FinderConstraints(min_freq=3, max_count=1000, - min_num_tokens=1, max_num_tokens=7)) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("", "morph.", "", - finder.FinderConstraints(min_freq=3, max_count=1000, - min_num_tokens=1, max_num_tokens=1)) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("", "morph.", "unauff.", - finder.FinderConstraints(min_freq=3, max_count=1000, - min_num_tokens=3, max_num_tokens=7)) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("", "ms", "", - finder.FinderConstraints(min_freq=30, max_count=500, - min_num_tokens=1, max_num_tokens=5)) - expected = [] - assert set(expected).issubset(actual) - - actual = finder.find_embeddings("Ð,Ð", "ms", "", - finder.FinderConstraints(min_freq=3, max_count=500, - min_num_tokens=1, max_num_tokens=7)) - expected = [] - assert set(expected).issubset(actual) - - def test_get_word_ngrams(): ngrams = resource_factory.get_word_ngrams().keys() unique_ngrams = set(ngrams) @@ -160,21 +23,27 @@ def test_get_word_ngrams(): def test_evaluation(): # XXX word2vec is not deterministic, different models might lead to slighthly different metrics - (precision, recall) = evaluation.do_analysis("resources/stefan_topic_list.tsv", + correct, found, valid = evaluation.do_analysis("resources/stefan_topic_list.tsv", "resources/detection_standard.tsv", "resources/expansion_standard.tsv", resolver.Strategy.WORD2VEC, evaluation.Level.TYPE, 10, True) + precision = metrics.calculate_precision(len(correct), len(found)) + recall = metrics.calculate_recall(len(correct), len(valid)) + absolute_tolerance = 0.02 - assert precision == pytest.approx(0.81, abs=absolute_tolerance) - assert recall == pytest.approx(0.68, abs=absolute_tolerance) + assert pytest.approx(0.81, abs=absolute_tolerance) == precision + assert pytest.approx(0.68, abs=absolute_tolerance) == recall - (precision, recall) = evaluation.do_analysis("resources/stefan_topic_list.tsv", + correct, found, valid = evaluation.do_analysis("resources/stefan_topic_list.tsv", "resources/detection_standard.tsv", "resources/expansion_standard.tsv", resolver.Strategy.DICTIONARY, evaluation.Level.TYPE, 10, True) + precision = metrics.calculate_precision(len(correct), len(found)) + recall = metrics.calculate_recall(len(correct), len(valid)) + absolute_tolerance = 0.02 - assert precision == pytest.approx(0.81, abs=absolute_tolerance) - assert recall == pytest.approx(0.49, abs=absolute_tolerance) + assert pytest.approx(0.81, abs=absolute_tolerance) == precision + assert pytest.approx(0.49, abs=absolute_tolerance) == recall diff --git a/tests/conftest.py b/tests/conftest.py index 73897e0..f346858 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,15 +2,14 @@ import pytest -from acres.nn import train +from acres.word2vec import train from 
acres.preprocess import resource_factory @pytest.fixture(scope="module", autouse=True) def delete_models(): - _delete_contents("tests/models/log") _delete_contents("tests/models/ngrams") - _delete_contents("tests/models/nn") + _delete_contents("tests/models/word2vec") _delete_contents("tests/models/pickle") @@ -22,8 +21,7 @@ def _delete_contents(folder): def path_resource_factory(): resource_factory.PICKLE_FOLDER = "tests/models/pickle/" resource_factory.NGRAMS_FOLDER = "tests/models/ngrams/" - resource_factory.LOG_FOLDER = "tests/models/log/" - resource_factory.NN_MODELS_FOLDER = "tests/models/nn/" + resource_factory.NN_MODELS_FOLDER = "tests/models/word2vec/" resource_factory.DATA_FOLDER = "tests/data" resource_factory.reset() print("INFO: Switched to test data.") @@ -72,25 +70,3 @@ def word_ngrams(): # Teardown: revert back to old resource_factory.WORD_NGRAMS = old - - -@pytest.fixture(scope="module") -def index(): - # Setup: save current one and assign a fake one - old = resource_factory.INDEX - resource_factory.INDEX = {"¶": {1, 4, 6}, - "der": {2}, - "EKG": {3, 4, 5}, - "*": {4, 6}, - "Im": {5}, - "Physikalischer": {6}, - "Status": {6}, - "for": {7, 8}, - "WORD": {7}, - "embeddings": {7, 8}, - "WabcOabcRabcDabc": {8} - } - yield resource_factory.INDEX - - # Teardown: revert back to old - resource_factory.INDEX = old \ No newline at end of file diff --git a/tests/evaluation/test_corpus.py b/tests/evaluation/test_corpus.py deleted file mode 100644 index b3aaab5..0000000 --- a/tests/evaluation/test_corpus.py +++ /dev/null @@ -1,14 +0,0 @@ -import os.path - -from acres.evaluation import corpus - - -def test_find_synonyms(ngramstat): - output_file = "tests/models/log/logWebs.txt" - - if os.path.isfile(output_file): - os.remove(output_file) - assert not os.path.isfile(output_file) - - corpus.find_synonyms() - assert os.path.isfile(output_file) diff --git a/tests/ngram/test_ngrams.py b/tests/model/test_ngrams.py similarity index 92% rename from tests/ngram/test_ngrams.py rename to tests/model/test_ngrams.py index 12281c8..abf2d58 100644 --- a/tests/ngram/test_ngrams.py +++ b/tests/model/test_ngrams.py @@ -1,6 +1,6 @@ from itertools import islice -from acres.ngram import ngrams +from acres.model import ngrams from acres.util.acronym import Acronym diff --git a/tests/ngram/test_finder.py b/tests/ngram/test_finder.py deleted file mode 100644 index 02eaf8b..0000000 --- a/tests/ngram/test_finder.py +++ /dev/null @@ -1,56 +0,0 @@ -from acres.ngram import finder - - -def test__build_search_ngrams(): - context = "a b c d" - - expected = ("a", "a b", "a b c") - actual = finder._build_search_ngrams(context) - assert expected == actual - - expected = ("d", "c d", "b c d") - actual = finder._build_search_ngrams(context, True) - assert expected == actual - - -def test__strip_frequencies(): - embeddings = [(42, "Abc Def, xyz"), (10, "aaaaaaa")] - - expected = ["Abc Def, xyz", "aaaaaaa"] - actual = finder._strip_frequencies(embeddings) - - assert expected == actual - - -def test_robust_find_embeddings(ngramstat, index): - actual = finder.robust_find_embeddings("EKG", "", "") - expected = ['Physikalischer Status'] - assert set(expected).issubset(actual) - - -def test_find_embeddings(ngramstat, index): - finder_constraints = finder.FinderConstraints(min_freq=1, max_count=100, min_num_tokens=1, - max_num_tokens=5) - - # Explicit context - actual = finder.find_embeddings("", "EKG", "¶", finder_constraints) - expected = [(19, 'Physikalischer Status')] - assert set(expected).issubset(actual) - - # Relax right 
context - actual = finder.find_embeddings("", "EKG", "", finder_constraints) - expected = [(19, 'Physikalischer Status')] - assert set(expected).issubset(actual) - - # Relax left and right contexts - actual = finder.find_embeddings("", "EKG", "", finder_constraints) - expected = [(19, 'Physikalischer Status')] - assert set(expected).issubset(actual) - - # Changing min_num_tokens should restrict results - finder_constraints = finder.FinderConstraints(min_freq=1, max_count=100, min_num_tokens=28, - max_num_tokens=5) - # "* EKG ¶", the only valid embedding, happens 27 times - actual = finder.find_embeddings("", "EKG", "", finder_constraints) - expected = [(19, 'Physikalischer Status')] - assert not set(expected).issubset(actual) \ No newline at end of file diff --git a/tests/nn/__init__.py b/tests/nn/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/preprocess/test_dumps.py b/tests/preprocess/test_dumps.py index 41086d5..e51184d 100644 --- a/tests/preprocess/test_dumps.py +++ b/tests/preprocess/test_dumps.py @@ -1,32 +1,17 @@ -from acres.preprocess import dumps, resource_factory - - -def test_create_morpho_dump(): - actual = dumps.create_morpho_dump("tests/resources/lex.xml") - expected = {'gramm', 'nieren', 'herc', 'crancheit', 'cardio', 'arbeits', 'el', 'cammer', 'electro', 'coronar'} - - assert expected.issubset(actual) - - -def test_create_corpus_char_stat_dump(): - char_ngrams = dumps.create_corpus_char_stat_dump("tests/data") - - actual = len(char_ngrams) - expected = 86187 - assert expected == actual +from acres.preprocess import dumps def test_create_corpus_ngramstat_dump(): ngramstat = dumps.create_corpus_ngramstat_dump("tests/data", 100) actual = len(ngramstat) - expected = 29 + expected = 24 assert expected == actual ngramstat = dumps.create_corpus_ngramstat_dump("tests/data", 2) # Check length actual = len(ngramstat) - expected = 18299 + expected = 15860 assert expected == actual # Baseline @@ -42,26 +27,3 @@ def test_create_corpus_ngramstat_dump(): # ...nor duplicate entries assert len(unique_ngrams) == len(ngrams) - - -def test_create_index(ngramstat, index): - actual = dumps.create_index(ngramstat) - expected = index - - # Dictionary comparison - for key, value in expected.items(): - assert value == actual[key] - - -def test_create_acro_dump(ngramstat): - actual = dumps.create_acro_dump() - expected = ['EKG'] - - assert expected == actual - - -def test_create_new_acro_dump(ngramstat): - actual = dumps.create_new_acro_dump() - expected = ['Im EKG'] - - assert set(expected).issubset(actual) diff --git a/tests/preprocess/test_resource_factory.py b/tests/preprocess/test_resource_factory.py index 1507ef3..0ea4893 100644 --- a/tests/preprocess/test_resource_factory.py +++ b/tests/preprocess/test_resource_factory.py @@ -8,36 +8,6 @@ def test_fixture(): assert "tests" in resource_factory.PICKLE_FOLDER -def test_get_morphemes(): - output_file = "tests/models/pickle/morphemes.p" - - if os.path.isfile(output_file): - os.remove(output_file) - assert not os.path.isfile(output_file) - - # Forces recreation - resource_factory.MORPHEMES = [] - - resource_factory.get_morphemes() - assert os.path.isfile(output_file) - - -def test_getindex(ngramstat): - resource_factory.MIN_FREQ = 1 - - output_file = "tests/models/pickle/index-1-" + resource_factory.VERSION + ".p" - - if os.path.isfile(output_file): - os.remove(output_file) - assert not os.path.isfile(output_file) - - # Forces recreation - resource_factory.INDEX = [] - - resource_factory.get_index() - assert 
os.path.isfile(output_file) - - def test_get_ngramstat(monkeypatch, ngramstat): # Monkey patch create_indexed_ngrams so that it returns the fake ngramstat def mockreturn(word_ngrams): @@ -57,44 +27,3 @@ def mockreturn(word_ngrams): resource_factory.get_ngramstat() assert os.path.isfile(output_file) - - -def test_get_acronym_ngrams(monkeypatch): - # Monkey patch create_new_acro_dump so that tests do not depend on all acronyms - def mockreturn(): - return ["EKG", "AP"] - monkeypatch.setattr(dumps, "create_new_acro_dump", mockreturn) - - output_file = "tests/models/pickle/acronymNgrams.p" - - if os.path.isfile(output_file): - os.remove(output_file) - assert not os.path.isfile(output_file) - - resource_factory.get_acronym_ngrams() - assert os.path.isfile(output_file) - - -def test_get_acronyms(): - output_file = "tests/models/pickle/acronyms.p" - - if os.path.isfile(output_file): - os.remove(output_file) - assert not os.path.isfile(output_file) - - resource_factory.get_acronyms() - assert os.path.isfile(output_file) - - -def test_get_character_ngrams(): - output_file = "tests/models/pickle/character_ngrams.p" - - if os.path.isfile(output_file): - os.remove(output_file) - assert not os.path.isfile(output_file) - - # Forces recreation - resource_factory.CHARACTER_NGRAMS = {} - - resource_factory.get_character_ngrams() - assert os.path.isfile(output_file) diff --git a/tests/rater/test_rater.py b/tests/rater/test_rater.py index bedc138..c51884f 100644 --- a/tests/rater/test_rater.py +++ b/tests/rater/test_rater.py @@ -43,26 +43,3 @@ def test_get_acronym_score(): # TODO Wrong #assert rater.get_acronym_score("SR", "Sinusrythmus") > rater.get_acronym_score("SR", "Sinusarrhythmie") - - -def test_get_acronym_score_variants(): - # Acronyms created out of spelling variants are accepted - assert 1.0 == rater.get_acronym_score_variants("AK", "Arbeitskammer") - assert 1.0 == rater.get_acronym_score_variants("AC", "Arbeitskammer") - - # But not the opposite! - # TODO Is is expected? 
- assert 0.0 == rater.get_acronym_score_variants("AK", "Arbeitscammer") - - # Score of the best variant should be preserved - assert 2.0 == rater.get_acronym_score_variants("AK", "Arbeits Kranker") # sic - - # Acronyms with only plural letters should not cause IndexError - assert 0 == rater.get_acronym_score_variants("SS", "Überprüfen Sie die") - - -def test_get_acronym_definition_pair_score(): - assert 10 == rater.get_acro_def_pair_score("EKG", "EKG (Elektrokardiogramm)")[1] - - # FIXME Does not work - #assert 10 == rater.get_acronym_definition_pair_score("ARDS", "ARDS (akutes Atemnotsyndrom)")[1] \ No newline at end of file diff --git a/tests/test_config.py b/tests/test_config.py index cb073b5..409eca6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -16,7 +16,3 @@ def test_config(): assert my_config[section] for option in default_config.options(section): assert my_config.get(section, option) - - # Addtional checks - assert os.path.isfile(my_config["DEFAULT"]["MORPH_ENG"]) - assert os.path.isfile(my_config["DEFAULT"]["MORPH_GER"]) diff --git a/tests/util/test_acronym.py b/tests/util/test_acronym.py index 2132ecd..b0a6bca 100644 --- a/tests/util/test_acronym.py +++ b/tests/util/test_acronym.py @@ -1,16 +1,6 @@ from acres.util import acronym -def test_extract_acronym_definition(): - max_length = 7 - - expected = ('EKG', 'Elektrokardiogramm') - assert expected == acronym.extract_acronym_definition("EKG (Elektrokardiogramm)", max_length) - assert expected == acronym.extract_acronym_definition("Elektrokardiogramm (EKG)", max_length) - - assert acronym.extract_acronym_definition("Elektrokardiogramm", max_length) is None - - def test_is_acronym(): # Single digits are not acronyms assert not acronym.is_acronym("A", 3) @@ -66,110 +56,6 @@ def test_create_german_acronym(): # assert expected == actual -def test_is_proper_word(): - # Baseline - assert True == acronym.is_proper_word("Elektrokardiogramm") - - # Too short - assert False == acronym.is_proper_word("A") - - # Dashes are allowed in the middle - assert False == acronym.is_proper_word("-abc-") - assert True == acronym.is_proper_word("abc-def") - assert False == acronym.is_proper_word("abc/def") - - # Must have proper case - assert False == acronym.is_proper_word("BEFUND") - - -def test_split_expansion(): - # Baseline - expected = [('Elektro', 'kardio', 'gramm'), - ('Ele', 'ktrokardio', 'gramm')] - actual = acronym.split_expansion("EKG", "Elektrokardiogramm") - assert expected == actual - - # Single letter acronyms, even though not valid, should not break - expected = [('a')] - actual = acronym.split_expansion("A", "a") - assert expected == actual - - # Expansion = acronym should still work - expected = [('a', 'b')] - actual = acronym.split_expansion("AB", "ab") - assert expected == actual - - expected = [('a', 'b', 'c')] - actual = acronym.split_expansion("ABC", "abc") - assert expected == actual - - expected = [('a', 'b', 'c', 'd', 'e')] - actual = acronym.split_expansion("ABCDE", "abcde") - assert expected == actual - - # Unexpected splits - expected = [('T', 'rikuspidal', 'i', 'n', 'suffizienz'), # Correct one - ('T', 'r', 'ikuspidali', 'n', 'suffizienz')] - actual = acronym.split_expansion("TRINS", "Trikuspidalinsuffizienz") - assert expected == actual - - # No valid expansion should return empty - expected = [] - actual = acronym.split_expansion("EKG", "Elektro") - assert expected == actual - - # Underscore and dashes - assert [("Kardio", "gra mm")] == acronym.split_expansion("KG", "Kardiogra_mm") - assert [] == 
acronym.split_expansion("KG", "Kardiogra-mm") - - # Whitespaces - assert [("Kardio", "gra mm")] == acronym.split_expansion("KG", "Kardiogra mm") - assert [("Kardio", "gra mm")] == acronym.split_expansion("KG", "Kardiogra\tmm") - - # Digit marker - assert [("Kardio", "graÐmm")] == acronym.split_expansion("KG", "KardiograÐmm") - - # Symbols - assert [("Kardio", "gra mm")] == acronym.split_expansion("KG", "Kardiogra&mm") - - # FIXME Very poor performance - # expected = [] - # actual = acronym.split_expansion('ACE-Hemmerunverträglichkeit', 'ACEHemmerunverträglichkeit') - - -def test__acronym_aware_clean_expansion(): - # Baseline: return expansion if no symbols are found - assert "Elektrokardiogramm" == acronym._acronym_aware_clean_expansion("EKG", "Elektrokardiogramm") - - # We should clean symbols, unless they appear in the acronym itself - assert "Angina pectoris" == acronym._acronym_aware_clean_expansion("AP", "Angina&pectoris") - assert "Angina&pectoris" == acronym._acronym_aware_clean_expansion("A&P", "Angina&pectoris") - - # We should preserve spaces, happening or not in the acronym itself - assert "Angina pectoris" == acronym._acronym_aware_clean_expansion("AP", "Angina pectoris") - assert "Angina pectoris" == acronym._acronym_aware_clean_expansion("A P", "Angina pectoris") - - # We should preserve hyphens, happening or not in the acronym itself - assert "Angina-pectoris" == acronym._acronym_aware_clean_expansion("AP", "Angina-pectoris") - assert "Angina-pectoris" == acronym._acronym_aware_clean_expansion("A-P", "Angina-pectoris") - - # We strip the output even if the acronym itself is stripped - assert "Angina pectoris" == acronym._acronym_aware_clean_expansion(" AP ", " Angina pectoris ") - - # XXX We do not remove duplicated spaces - assert "Angina pectoris" == acronym._acronym_aware_clean_expansion("AP", "Angina&&&pectoris") - - -def test_split_ngram(): - assert [] == acronym.split_ngram("a b c") - # - assert [('a', 'AK', 'b')] == acronym.split_ngram("a AK b") - # - assert [('l', 'ACR1', 'b ACR2 c'), ('l ACR1 b', 'ACR2', 'c')] == acronym.split_ngram("l ACR1 b ACR2 c") - # - assert [('', 'ACR', '')] == acronym.split_ngram("ACR") - - def test_trim_plural(): # Baseline assert "EKG" == acronym.trim_plural("EKGs") diff --git a/tests/util/test_functions.py b/tests/util/test_functions.py index 341f6bf..b8786e7 100644 --- a/tests/util/test_functions.py +++ b/tests/util/test_functions.py @@ -8,15 +8,6 @@ def test_import_conf(): assert isinstance(corpus_path, str) -def test_get_url(): - url = "https://github.com/bst-mug/acres" - actual = functions.get_url(url) - - # 200 means OK - assert 200 == actual.status_code - assert "Acronym" in actual.text - - def test_create_ngram_statistics(): assert functions.create_ngram_statistics('a', 1, 1) == {'a': 1} assert functions.create_ngram_statistics('a b', 1, 1) == {'a': 1, 'b': 1} @@ -51,16 +42,6 @@ def test_create_ngram_statistics(): assert expected == actual -def test_random_sub_list(): - # We output the input list if the length requested is larger or equal to - # the input length - assert functions.random_sub_list(["a", "b"], 2) == ["a", "b"] - assert functions.random_sub_list(["a", "b"], 3) == ["a", "b"] - - # TODO use Random.seed() so that the output is deterministic - assert functions.random_sub_list(["a", "b"], 1) in [["a"], ["b"]] - - def test_robust_text_import_from_dir(): actual = functions.robust_text_import_from_dir("tests/data") print(str(len(actual))) diff --git a/tests/util/test_text.py b/tests/util/test_text.py index d4ec002..f6a15e5 
100644 --- a/tests/util/test_text.py +++ b/tests/util/test_text.py @@ -1,49 +1,7 @@ import acres.util -from acres import constants from acres.util import text -def test_fix_line_endings(): - expected = "This is a short text¶" - actual = acres.util.text.fix_line_endings("This is a short text") - assert expected == actual - - expected = "der Patientin¶" - actual = acres.util.text.fix_line_endings("der\nPatientin") - assert expected == actual - - expected = "DIAGNOSEN¶---------¶" - actual = acres.util.text.fix_line_endings("DIAGNOSEN\n---------") - assert expected == actual - - -def test_replace_punctuation(): - expected = "life changing EKG" - actual = acres.util.text.replace_punctuation("life-changing. EKG. ") - assert expected == actual - - -def test_transliterate_to_seven_bit(): - constants.LANGUAGE = "en" - assert "HATTE" == acres.util.text.transliterate_to_seven_bit("hätte") - assert "ANGSTROM" == acres.util.text.transliterate_to_seven_bit("ångström") - - constants.LANGUAGE = "de" - assert "HAETTE" == acres.util.text.transliterate_to_seven_bit("hätte") - assert "AANGSTROEM" == acres.util.text.transliterate_to_seven_bit("ångström") - - -def test_context_ngram(): - ngram = "A B C D E F" - assert text.context_ngram(ngram, 3, True) == "D E F" - assert text.context_ngram(ngram, 1, True) == "F" - assert text.context_ngram(ngram, 0, True) == "" - - assert text.context_ngram(ngram, 3, False) == "A B C" - assert text.context_ngram(ngram, 1, False) == "A" - assert text.context_ngram(ngram, 0, False) == "" - - def test_remove_duplicated_whitespaces(): expected = "abc def ghi z" actual = acres.util.text.remove_duplicated_whitespaces("abc def ghi z") diff --git a/tests/util/test_variants.py b/tests/util/test_variants.py deleted file mode 100644 index d448960..0000000 --- a/tests/util/test_variants.py +++ /dev/null @@ -1,51 +0,0 @@ -from acres.util import variants - - -def test__resolve_ambiguous_lists(): - expected = [['cyclophospham', ('id', 'ide')], ['cyclophospham', 'id'], ['cyclophospham', 'ide']] - actual = variants._resolve_ambiguous_lists([['cyclophospham', ('id', 'ide')]]) - assert expected == actual - - -def test__create_string_variants_as_list(): - expected = ['cyclophospham', ('id', 'ide')] - actual = variants._create_string_variants_as_list("cyclophosphamid", "id", "ide") - assert expected == actual - - # Empty search returns the imput string as a list - expected = ['cyclophosphamid'] - actual = variants._create_string_variants_as_list("cyclophosphamid", "", "ide") - assert expected == actual - - -def test__list_to_string(): - expected = "abc" - actual = variants._list_to_string(["a", "b", "c"]) - assert expected == actual - - # Non-string element returns empty - expected = "" - actual = variants._list_to_string(["a", "b", "c", ("g", "e")]) - assert expected == actual - - -def test__list_all_string_variants(): - expected = ['cyclophosphamid', 'cyclophosphamide'] - actual = variants._list_all_string_variants("cyclophosphamid", "id", "ide") - assert expected == actual - - -def test_generate_all_variants_by_rules(): - expected = ['Arterielle Verschlusskrankheit', 'Arterielle Verschluss Disorder', - 'Arterielle Verschlusscrankheit'] - actual = variants.generate_all_variants_by_rules("Arterielle Verschlusskrankheit") - assert expected == actual - - expected = ["elektrokardiogramm", "elektrocardiogramm"] - assert expected == variants.generate_all_variants_by_rules("elektrokardiogramm") - - # TODO "esophagus" is never mentioned in the corpus, while "oesophagus" is. 
- #assert ["ösophagus", "oesophagus"] == variants.generate_all_variants_by_rules("ösophagus") - assert ["herz"] == variants.generate_all_variants_by_rules("herz") - assert ["café"] == variants.generate_all_variants_by_rules("café") # TODO add cafe - assert ["à"] == variants.generate_all_variants_by_rules("à") # TODO add a diff --git a/tests/web/__init__.py b/tests/web/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/web/conftest.py b/tests/web/conftest.py deleted file mode 100644 index 3b9c4ad..0000000 --- a/tests/web/conftest.py +++ /dev/null @@ -1,34 +0,0 @@ -import pytest -import json - -from acres.util import functions - - -@pytest.fixture() -def mock_get_url(monkeypatch): - """ - Monketpatch functions.get_url so that it returns a fake web response. Adapted from: - https://www.angelaambroz.com/blog/posts/2018/Jan/24/a_few_of_my_favorite_pytest_things/ - Licensed under CC BY-SA 4.0 - - :param monkeypatch: - :return: - """ - - def mock_url(query, headers, params): - class FakeResponse(object): - def __init__(self): - self.headers = "" - self.text = '''{"webPages": {"value": [ - { - "name": "Elektrokardiogramm \u2013 Wikipedia", - "url": "", - "language": "", - "snippet": "" - } - ]}}''' - - def json(self): - return json.loads(self.text) - return FakeResponse() - monkeypatch.setattr(functions, "get_url", mock_url) \ No newline at end of file diff --git a/tests/web/test_azure.py b/tests/web/test_azure.py deleted file mode 100644 index 9699a09..0000000 --- a/tests/web/test_azure.py +++ /dev/null @@ -1,41 +0,0 @@ -from acres.web import azure - -# TODO use fixture for tests instead of committed azure.p - - -def test_get_web_corpus(): - expected = "" - actual = azure.get_web_corpus("akjkjgehwvmewm") - assert expected == actual - - expected = "Elektrokardiogramm" - actual = azure.get_web_corpus("EKG") - assert expected in actual - - -def test_cached_get_web_results(): - expected = "Elektrokardiogramm – Wikipedia" - actual = azure.cached_get_web_results("EKG") - assert expected in [result.name for result in actual] - - # Queries without valid results must not fail - expected = None - actual = azure.cached_get_web_results("akjkjgehwvmewm") - assert expected == actual - - -def test_get_web_results(monkeypatch, mock_get_url): - # Monkey patch is_valid_key so that it's always valid - def valid_key(): - return True - monkeypatch.setattr(azure, "is_valid_key", valid_key) - - expected = "Elektrokardiogramm – Wikipedia" - actual = azure.get_web_results("EKG") - assert expected in [result.name for result in actual] - - -def test_is_valid_key(): - # Environment variables are not available on GitHub pull requests - # assert azure.is_valid_key() - pass diff --git a/tests/web/test_base.py b/tests/web/test_base.py deleted file mode 100644 index 8d2f9d9..0000000 --- a/tests/web/test_base.py +++ /dev/null @@ -1,21 +0,0 @@ -from acres.web import base, azure, bing - - -def test_get_best_acronym_web_resolution(monkeypatch): - # Monkey patch get_web_corpus so that tests do not depend on web results - def mockreturn(query): - if query == "EKG": - return "Elektrokardiogramm – Wikipedia Das Elektrokardiogramm (EKG)" - if query == "EKG ccc ddd bbb eee aaa": - return "Elektrokardiogramm Elektrokardiogramm Elektrokardiogramm" - - monkeypatch.setattr(azure, "get_web_corpus", mockreturn) - monkeypatch.setattr(bing, "get_web_corpus", mockreturn) - - expected = ("Elektrokardiogramm", 10) - actual = base.get_best_acronym_web_resolution("", "EKG", "", 3, 5) - assert expected == actual - - expected 
= ("Elektrokardiogramm", 1) - actual = base.get_best_acronym_web_resolution("aaa bbb ccc", "EKG", "ddd eee fff", 3, 5) - assert expected == actual diff --git a/tests/web/test_bing.py b/tests/web/test_bing.py deleted file mode 100644 index af2ea69..0000000 --- a/tests/web/test_bing.py +++ /dev/null @@ -1,17 +0,0 @@ -def test_ngrams_web_dump(): - pass - - """ - # FIXME Flaky test in different geographic regions - - # A bing query for "EKG" should retrieve a high frequency of the word "Elektrokardiogramm! - - acronym = "EKG" - hit = False - lst_result = bing.ngrams_url_dump("http://www.bing.de/search?cc=de&q=%22" + acronym + "%22", 1, 10) - for i in range(1, 30): - (freq, exp) = lst_result[i] - if "Elektrokardiogram" in exp: - hit = True - assert hit - """ \ No newline at end of file diff --git a/tests/ngram/__init__.py b/tests/word2vec/__init__.py similarity index 100% rename from tests/ngram/__init__.py rename to tests/word2vec/__init__.py diff --git a/tests/nn/test_nn.py b/tests/word2vec/test_word2vec.py similarity index 85% rename from tests/nn/test_nn.py rename to tests/word2vec/test_word2vec.py index bb6d6f7..e7cbbf4 100644 --- a/tests/nn/test_nn.py +++ b/tests/word2vec/test_word2vec.py @@ -1,6 +1,6 @@ from itertools import islice -from acres.nn import test +from acres.word2vec import test def test_nn(ngramstat):