-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.conf
81 lines (62 loc) · 1.36 KB
/
crawler.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Connection parameters: content types, timeouts, size cap, and the
# identity strings configured for the crawler.
# Comma-separated list of MIME types (HTML and plain text).
fileType=text/html,text/plain
# NOTE(review): the three timeouts below share the value 9000; the unit is
# presumably milliseconds (9 s) — confirm against the code that reads them.
timeout=9000
soTimeout=9000
connMgrTimeout=9000
# NOTE(review): presumably a delay between requests; unit not visible here — confirm.
wait=0
# NOTE(review): presumably a per-file size cap in bytes — confirm.
maxFileSize=300000
# The value contains spaces and parentheses and is unquoted; keep it on one line.
usrAgent=TRSFC Crawler (Research purpose crawler)
# Contact address written with " - at - " in place of "@".
# NOTE(review): confirm whether the consumer expects this obfuscated form.
usrFrom=tanaphol - at - akane.waseda.jp
# General parameters
# NOTE(review): presumably the maximum number of pages fetched per site — confirm.
maxPagePerSite=300
isOnlyStaticURL=true
# NOTE(review): if this disables robots.txt handling, confirm that is
# intended and acceptable for the sites being crawled.
ignoreRobot=true
canonicalCount=3
allowHttps=true
# NOTE(review): -1 presumably means "no depth limit" — confirm.
maxDepth=-1
pageMode=true
# Limit on downloaded pages
# NOTE(review): presumably a cap on the total number of pages for the whole crawl — confirm.
limitDownloadedPages=20000
# Crawler parameters
# NOTE(review): the meaning and valid range of the values in this group are
# not visible in this file; the notes below are assumptions to confirm.
# Presumably a relevance score threshold in [0, 1].
relevanceDegree=0.5
segmentThreshold=3
distanceFromRelevantSeg=1
filterNon=false
# Unit not visible here (characters, tokens, or links) — confirm.
windowSize=100
# Page classifier
# "weka" selects the checker implementation; other accepted values are not
# visible in this file.
checkerType=weka
targetLang=en
# Path to the page-classifier model file (leading "/").
# NOTE(review): presumably resolved as a classpath resource — confirm.
pageClassifierModel=/resources/classifiers/page-tourism.model
# Disabled alternative pointing at an ARFF file instead of a model file:
#pageClassifierModel=page-tourism.arff
# NOTE(review): targetLang is "en" but this keywords file is named
# "thaiwords.txt" — confirm the two settings are meant to be used together.
relevantKeywordsPath=/resources/classifiers/thaiwords.txt
# Predictor settings (includes the link-classifier choice and its weights)
# Empty value: no training path is configured here.
predictorTrainingPath=
# NOTE(review): negative interval — presumably a sentinel that disables
# periodic updates; confirm.
updateInterval=-100
isTrainingMode=false
useNeighborhood=false
useHistory=false
useDup=false
# Disabled alternative link classifier (Weka IBk with its option string):
#linkClassifierAlgo=weka.classifiers.lazy.IBk
#linkClassifierParams=-K 5 -W 0
# Active link classifier: fully qualified Weka class name; the option string
# is left empty.
linkClassifierAlgo=weka.classifiers.bayes.NaiveBayesUpdateable
linkClassifierParams=
# Active weights: four comma-separated values. Each disabled alternative
# below lists three values — NOTE(review): confirm the expected count.
weightClassifiers=1,1,1,1
#tourism k5
#weightClassifiers=0.888,0.846,0.855
#tourism k3
#weightClassifiers=0.8,0.716,0.634
# NOTE(review): the label "tourism k5" is used a second time here with
# different weights; one of the two labels is probably wrong — confirm.
#tourism k5
#weightClassifiers=0.898,0.806,0.832
# Threads
# NOTE(review): presumably the number of worker threads — confirm.
threads=1
# Seed
# Relative path. NOTE(review): presumably the file of seed URLs, resolved
# against the working directory — confirm.
seedPath=test.txt
# Proxy
# All three values are empty. NOTE(review): presumably this means no proxy
# is used — confirm how the reader treats empty values (especially proxyPort).
localProxyPath=
proxyServer=
proxyPort=
# Database / download parameters
# Both paths are relative and end with "/".
# NOTE(review): presumably resolved against the working directory — confirm.
downloadPath=logs/dl-tmp/
dbPath=db-tmp/
# NOTE(review): presumably stored downloads are gzip-compressed — confirm.
isGzip=true
collectDestinationUrl=false