# Specify format for the log outputs
[logformat]
filename = msgs.log
datefmt = %%Y-%%d-%%m %%H:%%M:%%S
file_format = %%(asctime)s | %%(levelname)-8s | %%(message)s
file_level = INFO
cons_level = DEBUG
cons_format = %%(levelname)-8s | %%(message)s
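#Note: the doubled '%%' in this section is configparser interpolation escaping;
#each '%%' reads back as a literal '%'. A minimal illustrative sketch (not part
#of this project; assumes the file has been copied to config.cf) of wiring this
#section into Python's logging module:
#
#   import configparser, logging
#   cfg = configparser.ConfigParser()
#   cfg.read('config.cf')
#   lf = cfg['logformat']
#   logger = logging.getLogger('topicmodeler')
#   logger.setLevel(logging.DEBUG)
#   fh = logging.FileHandler(lf['filename'])
#   fh.setLevel(lf['file_level'])      # e.g. 'INFO'
#   fh.setFormatter(logging.Formatter(lf['file_format'], datefmt=lf['datefmt']))
#   ch = logging.StreamHandler()
#   ch.setLevel(lf['cons_level'])      # e.g. 'DEBUG'
#   ch.setFormatter(logging.Formatter(lf['cons_format']))
#   logger.addHandler(fh)
#   logger.addHandler(ch)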
[Spark]
spark_available = True
machines = 10
cores = 4
script_spark = /export/usuarios_ml4ds/jarenas/script-spark/script-spark
token_spark = /export/usuarios_ml4ds/jarenas/script-spark/tokencluster.json
[Dask]
num_workers = 0
[HDFS]
#These paths are specific to the UC3M deployment
Semantic Scholar = /export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201/papers.parquet
PATSTAT = /export/ml4ds/IntelComp/Datalake/PATSTAT/2022_Spring/patstat_appln.parquet
CORDIS = /export/ml4ds/IntelComp/Datalake/CORDIS/20220221/new_parquet/projects.parquet
[Preproc]
#Minimum number of lemmas required to keep a document in the corpus
min_lemas = 15
#Remove words with fewer than no_below occurrences
no_below=10
#Remove words appearing in more than a fraction no_above of the documents
no_above=0.6
#Maximum number of words in vocabulary
keep_n=500000
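#These parameter names match Gensim's Dictionary.filter_extremes; a hedged,
#illustrative sketch of applying this section (toy input, not project code):
#
#   from gensim.corpora import Dictionary
#   docs = [["one", "tokenized", "document"], ["another", "one"]]  # toy input
#   D = Dictionary(docs)
#   D.filter_extremes(no_below=10, no_above=0.6, keep_n=500000)
#   docs = [d for d in docs if len(d) >= 15]  # drop docs below min_lemas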
[TM]
#Default setting for number of topics
ntopics=25
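#Threshold for sparsifying the document-topic proportions (thetas)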
thetas_thr=3e-3
[MalletTM]
#Path to mallet binary
mallet_path=/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet
#Regular expression for token identification
token_regexp=[\p{L}\p{N}][\p{L}\p{N}\p{P}]*\p{L}
#Settings for mallet training and doctopics postprocessing
alpha=5
optimize_interval=10
num_threads=4
num_iterations=1000
doc_topic_thr=0
num_iterations_inf=100
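#A rough, illustrative sketch of the Mallet call these settings describe
#(the input corpus.mallet and output path are hypothetical; the corpus is
#assumed to be already imported):
#
#   import subprocess
#   from configparser import ConfigParser
#   cfg = ConfigParser(); cfg.read('config.cf')
#   mt = cfg['MalletTM']; tm = cfg['TM']
#   subprocess.run([
#       mt['mallet_path'], 'train-topics',
#       '--input', 'corpus.mallet',
#       '--num-topics', tm['ntopics'],
#       '--alpha', mt['alpha'],
#       '--optimize-interval', mt['optimize_interval'],
#       '--num-threads', mt['num_threads'],
#       '--num-iterations', mt['num_iterations'],
#       '--doc-topics-threshold', mt['doc_topic_thr'],
#       '--output-doc-topics', 'doc-topics.txt',
#   ], check=True)
#
#token_regexp is applied at import time ('mallet import-file --token-regex ...');
#num_iterations_inf applies when inferring topics for new documents.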
[SparkLDA]
alpha=5
maxIterations=20
#Supported values for optimizer are 'em' and 'online'
optimizer=online
optimizeDocConcentration=True
subsamplingRate=0.05
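#These names mirror pyspark.ml.clustering.LDA parameters; a hedged sketch
#(k and the features column name are illustrative):
#
#   from pyspark.ml.clustering import LDA
#   lda = LDA(k=25, maxIter=20, optimizer='online',
#             optimizeDocConcentration=True, subsamplingRate=0.05,
#             docConcentration=[5.0], featuresCol='bow')
#   model = lda.fit(df)   # df: DataFrame with a 'bow' vector column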
[ProdLDA]
model_type=prodLDA
hidden_sizes=(100,100)
activation=softplus
dropout=0.2
learn_priors=True
lr=2e-3
momentum=0.99
solver=adam
num_epochs=100
reduce_on_plateau=False
batch_size=64
topic_prior_mean=0.0
topic_prior_variance=None
num_samples=10
num_data_loader_workers=0
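#configparser returns every value as a string, so the tuple, booleans and None
#in this section need explicit coercion. An illustrative sketch:
#
#   import ast
#   from configparser import ConfigParser
#   cfg = ConfigParser(); cfg.read('config.cf')
#   p = cfg['ProdLDA']
#   hidden_sizes = ast.literal_eval(p['hidden_sizes'])   # (100, 100)
#   learn_priors = p.getboolean('learn_priors')          # True
#   lr = p.getfloat('lr')                                # 0.002
#   topic_prior_variance = ast.literal_eval(p['topic_prior_variance'])  # None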
[CTM]
model_type=prodLDA
ctm_model_type=CombinedTM
hidden_sizes=(100,100)
activation=softplus
dropout=0.2
learn_priors=True
batch_size=64
lr=2e-3
momentum=0.99
solver=adam
num_epochs=100
num_samples=10
reduce_on_plateau=False
topic_prior_mean=0.0
topic_prior_variance=None
num_data_loader_workers=0
label_size=0
loss_weights=None
sbert_model_to_load=paraphrase-distilroberta-base-v1
contextual_size=768
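#contextual_size should match the embedding width of sbert_model_to_load;
#paraphrase-distilroberta-base-v1 produces 768-dimensional vectors. A quick
#illustrative check with sentence-transformers:
#
#   from sentence_transformers import SentenceTransformer
#   m = SentenceTransformer('paraphrase-distilroberta-base-v1')
#   assert m.get_sentence_embedding_dimension() == 768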
[bertopic]
no_below=1
no_above=1
get_sims=False
sbert_model=paraphrase-distilroberta-base-v2
umap_n_components=5
umap_n_neighbors=15
umap_min_dist=0.0
umap_metric=cosine
hdbscan_min_cluster_size=10
hdbscan_metric=euclidean
hdbscan_cluster_selection_method=eom
hdbscan_prediction_data=True
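#These settings line up with the constructor arguments of umap-learn, hdbscan
#and BERTopic; a hedged sketch of passing them through (docs is hypothetical):
#
#   from umap import UMAP
#   from hdbscan import HDBSCAN
#   from bertopic import BERTopic
#   umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0,
#                     metric='cosine')
#   hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
#                           cluster_selection_method='eom',
#                           prediction_data=True)
#   topic_model = BERTopic(embedding_model='paraphrase-distilroberta-base-v2',
#                          umap_model=umap_model, hdbscan_model=hdbscan_model)
#   topics, probs = topic_model.fit_transform(docs)  # docs: list of strings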
[Hierarchical]
expansion_tpc=0
htm_version=htm-ds
thr=0.2
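#Settings for hierarchical topic models: expansion_tpc is the topic to expand
#into a submodel; htm_version selects the variant (htm-ds: document selection,
#htm-ws: word selection); thr is the document-selection threshold for htm-ds.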