-
Notifications
You must be signed in to change notification settings - Fork 2
/
Statistics_of_different_labels_for_one_document
120 lines (97 loc) · 6.11 KB
/
Statistics_of_different_labels_for_one_document
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/perl -w
########################################################################################################################################################################################
## For a detailed description of this metric and source code, please read: #####
## This code counts the distinct labels in a document and the number of occurrences of each label #####
## This Perl code was written by Aaron Li-Feng Han at the University of Macau, 2013.01. It was originally written for the published paper: #####
## "Phrase Tagset Mapping for French and English Treebanks and Its Application in Machine Translation Evaluation". 2013. Aaron Li-Feng Han, Derek F. Wong, Lidia S. Chao, #####
## Liangye He, Shuo Li, and Ling Zhu. Proceedings of the GSCL 2013, Germany. LNCS Vol. 8105, pp. 119–131. Volume Editors: Iryna Gurevych, Chris Biemann and Torsten Zesch. #####
## Source code website: https://github.com/aaronlifenghan/aaron-project-hppr #####
## Online paper: http://www.springer.com/computer/database+management+%26+information+retrieval/book/978-3-642-40721-5 #####
## All Copyright (c) 2013 by author Aaron Li-Feng Han < hanlifengaaron@gmail.com > #####
########################################################################################################################################################################################
## D:\call paper\Conferences-2\test NLEPOMS\UniDeal_ppos2012\src\newstest2012-src.en\ppos
## Oxford-dic1200.zh.parse
#open (TEST,"<:encoding(utf8)","D:\\call paper\\Conferences-2\\test NLEPOMS\\UniDeal_ppos2012\\src\\newstest2012-src.en\\ppos\\newstest2012-ref.en.tok.parsed.ppos") || die "can not open file: $!";
#open (RESULT,">:encoding(utf8)","D:\\call paper\\Conferences-2\\test NLEPOMS\\UniDeal_ppos2012\\src\\newstest2012-src.en\\ppos\\Statistical_label_wmt2012.ref.en.ppos.txt") || die "$!";
##open (TEST,"<:encoding(utf8)","E:\\Berkeley_Parser\\statistic\\Oxford-dic1200.zh.parse") || die "can not open file: $!";
##open (RESULT,">:encoding(utf8)","E:\\Berkeley_Parser\\statistic\\Oxford-dic1200.zh.parse.stat") || die "$!";
###open (TEST,"<:encoding(utf8)","E:\\Berkeley_Parser\\statistic\\Oxford-dic12000.zh.parse") || die "can not open file: $!";
###open (RESULT,">:encoding(utf8)","E:\\Berkeley_Parser\\statistic\\Oxford-dic12000.zh.parse.stat") || die "$!";
####open (TEST,"<:encoding(utf8)","E:\\Berkeley_Parser\\statistic\\Oxford-dictionary.zh.parse") || die "can not open file: $!";
####open (RESULT,">:encoding(utf8)","E:\\Berkeley_Parser\\statistic\\Oxford-dictionary.zh.parse.stat") || die "$!";
use strict;
use warnings;

## ---------------------------------------------------------------------------
## Label statistics for one parsed document.
##
## Reads a UTF-8 text file line by line, splits every line into
## whitespace-separated tokens ("labels"), and writes a report containing:
##   1. the token count of every line and the total number of lines,
##   2. the number of distinct tokens in the whole document,
##   3. every distinct token that contains a left bracket "(" together with
##      its number of occurrences, in order of first appearance.
##
## Usage: perl <this script> [INPUT [OUTPUT]]
##   INPUT   file to analyse; defaults to the original hard-coded path.
##   OUTPUT  report file; defaults to INPUT with ".stat" appended.
## Dies with the offending path and the OS error when a file cannot be
## opened or the report cannot be flushed to disk.
## ---------------------------------------------------------------------------

my $default_input = "E:\\Berkeley_Parser\\from-Luyi\\english\\test-program.txt";
my $input_path    = @ARGV >= 1 ? $ARGV[0] : $default_input;
my $output_path   = @ARGV >= 2 ? $ARGV[1] : "$input_path.stat";

## Only labels matching this pattern are listed in the report.  Use qr/./ to
## list every label, or qr/[A-Za-z]/ to list the labels containing a letter.
my $report_filter = qr/\(/;

## Both files are opened before any work is done, so a bad output path is
## reported before the input is read.
open(my $in_fh, "<:encoding(utf8)", $input_path)
    or die "can not open file '$input_path': $!";
open(my $out_fh, ">:encoding(utf8)", $output_path)
    or die "can not open file '$output_path': $!";

my @sentence_lengths;    ## number of tokens on each input line
my @labels_in_order;     ## distinct tokens, in order of first appearance
my %count_of;            ## token => number of occurrences in the document

while (my $line = <$in_fh>) {
    chomp $line;

    ## split ' ' skips leading whitespace; split /\s+/ would instead produce
    ## an empty first field for an indented line, and that empty string would
    ## be counted both as a token and as a distinct label.
    my @tokens = split ' ', $line;
    push @sentence_lengths, scalar @tokens;

    for my $token (@tokens) {
        ## Remember the first-seen order so the report stays deterministic.
        push @labels_in_order, $token unless exists $count_of{$token};
        $count_of{$token}++;
    }
}
close $in_fh;

my $sentence_num   = scalar @sentence_lengths;
my $distinct_total = scalar @labels_in_order;

print {$out_fh} 'length of each sentence of sysoutput:', "\n",
    join(' ', @sentence_lengths), "\n", $sentence_num, "\n";

print {$out_fh} "\n", 'number of different labels in the document:',
    "$distinct_total", "\n";
print {$out_fh} 'labels and their statistical repeat numbers:', "\n";

for my $label (@labels_in_order) {
    next unless $label =~ $report_filter;
    print {$out_fh} "$label", " $count_of{$label}", "\n";
}

## Buffered write errors (full disk, etc.) only surface at close time.
close $out_fh or die "can not close file '$output_path': $!";