# Specify format for the log outputs
[logformat]
filename = msgs.log
datefmt = %%Y-%%d-%%m %%H:%%M:%%S
file_format = %%(asctime)s | %%(levelname)-8s | %%(message)s
file_level = INFO
cons_level = DEBUG
cons_format = %%(levelname)-8s | %%(message)s
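#Note: the doubled '%%' in this section is configparser interpolation escaping;
#each '%%' reads back as a literal '%'. A minimal illustrative sketch (not part
#of this project; assumes the file has been copied to config.cf) of wiring this
#section into Python's logging module:
#
#   import configparser, logging
#   cfg = configparser.ConfigParser()
#   cfg.read('config.cf')
#   lf = cfg['logformat']
#   logger = logging.getLogger('topicmodeler')
#   logger.setLevel(logging.DEBUG)
#   fh = logging.FileHandler(lf['filename'])
#   fh.setLevel(lf['file_level'])      # e.g. 'INFO'
#   fh.setFormatter(logging.Formatter(lf['file_format'], datefmt=lf['datefmt']))
#   ch = logging.StreamHandler()
#   ch.setLevel(lf['cons_level'])      # e.g. 'DEBUG'
#   ch.setFormatter(logging.Formatter(lf['cons_format']))
#   logger.addHandler(fh)
#   logger.addHandler(ch)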
[Spark]
spark_available = True
machines = 10
cores = 4
script_spark = /export/usuarios_ml4ds/jarenas/script-spark/script-spark
token_spark = /export/usuarios_ml4ds/jarenas/script-spark/tokencluster.json
[Dask]
num_workers = 0
[HDFS]
#These paths are specific to the UC3M deployment
Semantic Scholar = /export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201/papers.parquet
PATSTAT = /export/ml4ds/IntelComp/Datalake/PATSTAT/2022_Spring/patstat_appln.parquet
CORDIS = /export/ml4ds/IntelComp/Datalake/CORDIS/20220221/new_parquet/projects.parquet
[Preproc]
#Minimum number of lemmas required to keep a document in the corpus
min_lemas = 15
#Remove words with fewer than no_below occurrences
no_below=10
#Remove words appearing in more than a fraction no_above of the documents
no_above=0.6
#Maximum number of words in vocabulary
keep_n=500000
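#These parameter names match Gensim's Dictionary.filter_extremes; a hedged,
#illustrative sketch of applying this section (toy input, not project code):
#
#   from gensim.corpora import Dictionary
#   docs = [["one", "tokenized", "document"], ["another", "one"]]  # toy input
#   D = Dictionary(docs)
#   D.filter_extremes(no_below=10, no_above=0.6, keep_n=500000)
#   docs = [d for d in docs if len(d) >= 15]  # drop docs below min_lemas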
[TM]
#Default setting for number of topics
ntopics=25
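#Threshold for sparsifying the document-topic proportions (thetas)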
thetas_thr=3e-3
[MalletTM]
#Path to mallet binary
mallet_path=/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet
#Regular expression for token identification
token_regexp=[\p{L}\p{N}][\p{L}\p{N}\p{P}]*\p{L}
#Settings for mallet training and doctopics postprocessing
alpha=5
optimize_interval=10
num_threads=4
num_iterations=1000
doc_topic_thr=0
num_iterations_inf=100
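#A rough, illustrative sketch of the Mallet call these settings describe
#(the input corpus.mallet and output path are hypothetical; the corpus is
#assumed to be already imported):
#
#   import subprocess
#   from configparser import ConfigParser
#   cfg = ConfigParser(); cfg.read('config.cf')
#   mt = cfg['MalletTM']; tm = cfg['TM']
#   subprocess.run([
#       mt['mallet_path'], 'train-topics',
#       '--input', 'corpus.mallet',
#       '--num-topics', tm['ntopics'],
#       '--alpha', mt['alpha'],
#       '--optimize-interval', mt['optimize_interval'],
#       '--num-threads', mt['num_threads'],
#       '--num-iterations', mt['num_iterations'],
#       '--doc-topics-threshold', mt['doc_topic_thr'],
#       '--output-doc-topics', 'doc-topics.txt',
#   ], check=True)
#
#token_regexp is applied at import time ('mallet import-file --token-regex ...');
#num_iterations_inf applies when inferring topics for new documents.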
[SparkLDA]
alpha=5
maxIterations=20
#Supported values for optimizer are 'em' and 'online'
optimizer=online
optimizeDocConcentration=True
subsamplingRate=0.05
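#These names mirror pyspark.ml.clustering.LDA parameters; a hedged sketch
#(k and the features column name are illustrative):
#
#   from pyspark.ml.clustering import LDA
#   lda = LDA(k=25, maxIter=20, optimizer='online',
#             optimizeDocConcentration=True, subsamplingRate=0.05,
#             docConcentration=[5.0], featuresCol='bow')
#   model = lda.fit(df)   # df: DataFrame with a 'bow' vector column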
[ProdLDA]
model_type=prodLDA
hidden_sizes=(100,100)
activation=softplus
dropout=0.2
learn_priors=True
lr=2e-3
momentum=0.99
solver=adam
num_epochs=100
reduce_on_plateau=False
batch_size=64
topic_prior_mean=0.0
topic_prior_variance=None
num_samples=10
num_data_loader_workers=0
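#configparser returns every value as a string, so the tuple, booleans and None
#in this section need explicit coercion. An illustrative sketch:
#
#   import ast
#   from configparser import ConfigParser
#   cfg = ConfigParser(); cfg.read('config.cf')
#   p = cfg['ProdLDA']
#   hidden_sizes = ast.literal_eval(p['hidden_sizes'])   # (100, 100)
#   learn_priors = p.getboolean('learn_priors')          # True
#   lr = p.getfloat('lr')                                # 0.002
#   topic_prior_variance = ast.literal_eval(p['topic_prior_variance'])  # None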
[CTM]
model_type=prodLDA
ctm_model_type=CombinedTM
hidden_sizes=(100,100)
activation=softplus
dropout=0.2
learn_priors=True
batch_size=64
lr=2e-3
momentum=0.99
solver=adam
num_epochs=100
num_samples=10
reduce_on_plateau=False
topic_prior_mean=0.0
topic_prior_variance=None
num_data_loader_workers=0
label_size=0
loss_weights=None
sbert_model_to_load=paraphrase-distilroberta-base-v1
contextual_size=768
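#contextual_size should match the embedding width of sbert_model_to_load;
#paraphrase-distilroberta-base-v1 produces 768-dimensional vectors. A quick
#illustrative check with sentence-transformers:
#
#   from sentence_transformers import SentenceTransformer
#   m = SentenceTransformer('paraphrase-distilroberta-base-v1')
#   assert m.get_sentence_embedding_dimension() == 768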
[bertopic]
no_below=1
no_above=1
get_sims=False
sbert_model=paraphrase-distilroberta-base-v2
umap_n_components=5
umap_n_neighbors=15
umap_min_dist=0.0
umap_metric=cosine
hdbscan_min_cluster_size=10
hdbscan_metric=euclidean
hdbscan_cluster_selection_method=eom
hdbscan_prediction_data=True
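#These settings line up with the constructor arguments of umap-learn, hdbscan
#and BERTopic; a hedged sketch of passing them through (docs is hypothetical):
#
#   from umap import UMAP
#   from hdbscan import HDBSCAN
#   from bertopic import BERTopic
#   umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0,
#                     metric='cosine')
#   hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
#                           cluster_selection_method='eom',
#                           prediction_data=True)
#   topic_model = BERTopic(embedding_model='paraphrase-distilroberta-base-v2',
#                          umap_model=umap_model, hdbscan_model=hdbscan_model)
#   topics, probs = topic_model.fit_transform(docs)  # docs: list of strings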
[Hierarchical]
expansion_tpc=0
htm_version=htm-ds
thr=0.2
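#Settings for hierarchical topic models: expansion_tpc is the topic to expand
#into a submodel; htm_version selects the variant (htm-ds: document selection,
#htm-ws: word selection); thr is the document-selection threshold for htm-ds.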