-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlParsing.java
260 lines (223 loc) · 11.5 KB
/
htmlParsing.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import java.io.*;
import java.util.*;
import java.lang.*;
import java.net.*;
/* YOU SHOULD NOT NEED TO LOOK AT THIS CODE AT ALL.
BUT IT COULD BE INTERESTING FOR YOU TO SEE HOW IT WORKS. */
// This class implements a simple parser for html documents
// This class implements a simple parser for html documents.
// All state and methods are static; the two maps record which URLs have
// already been queried so each URL is fetched at most once per method.
public class htmlParsing {

    // Directory used by writeContent to store local copies of pages.
    static String internetFilesLocation = "internetFiles";
    // URLs already passed to getLinks / getContent (the "1" value is a marker only).
    static HashMap<String, String> queriedURL_links = new HashMap<String, String>();
    static HashMap<String, String> queriedURL_content = new HashMap<String, String>();
    // In Windows, change the line below to static String directoryChar="\\";
    static String directoryChar = "/";

    // Substrings that disqualify a candidate link in parseLine: non-html
    // resources, external schemes, and administrative/navigation pages.
    private static final String[] BANNED_LINK_PARTS = {
        "javascript", "https", "Members", ".doc", "reservations", "announcements",
        "registration", "login_form", "mailto", "#", "jette", "bias", ".cgi",
        ".mp3", ".ppt", ".pl", ".jpg", ".dmg", ".mso", ".xml", ".css", "internal",
        ".java", ".png", "<li>", ".exe", ".aft", ".pdf", ".ico", ".ps", "?",
        "credits", "..", "./", "@", "tar", "gz", "zip", "txt", "hqx", "mws",
        "finak", "nex", "docs"
    };

    // Removes every trailing '/' from s (returns "" unchanged for the empty string).
    private static String stripTrailingSlashes(String s) {
        while (s.endsWith("/")) s = s.substring(0, s.length() - 1);
        return s;
    }

    /* getLinks returns a LinkedList of Strings containing all the links out of
       a given url. Only the www.cs.mcgill.ca domain is considered. Personal
       web pages (those with URL containing "~") are not considered. Only html
       links are considered. The given url must contain the full path
       (implicit "index.html" is not allowed): "http://www.cs.mcgill.ca" is not
       a valid URL, but "http://www.cs.mcgill.ca/index.html" is.
       Throws Exception if the same URL is queried twice; returns an empty
       list if the URL cannot be opened. */
    static public LinkedList<String> getLinks(String url) throws Exception {
        LinkedList<String> ret = new LinkedList<String>(); // the list of URLs to be returned
        // On the main page every link counts; on other pages links are ignored
        // until the "panel-body" marker is seen, to skip the navigation menu.
        boolean isMainPage = url.equals("http://www.cs.mcgill.ca");
        if (queriedURL_links.containsKey(url))
            throw new Exception("Tried to query the following URL twice: " + url);
        queriedURL_links.put(url, "1");
        BufferedReader myURL;
        try {
            URLConnection conn = new URL(url).openConnection();
            myURL = new BufferedReader(new InputStreamReader(conn.getInputStream()));
        } catch (Exception e) {
            System.out.println("Error opening URL:" + url + "\n" + e);
            return new LinkedList<String>();
        }
        // The domain is the URL with any document name and trailing slashes removed.
        String domain = (url.indexOf(".html") != -1 || url.indexOf(".php") != -1)
                ? url.substring(0, url.lastIndexOf('/'))
                : url;
        domain = stripTrailingSlashes(domain);
        boolean inMenu = true;
        String line;
        while (true) {
            try {
                line = myURL.readLine();
            } catch (Exception e) {
                // A persistent read error previously retried forever; stop instead.
                break;
            }
            if (line == null) break; // end of stream
            // parseLine has no side effects, so it is safe to skip it for menu lines.
            if (isMainPage || !inMenu) ret.addAll(parseLine(line, domain));
            if (line.indexOf("panel-body") != -1) inMenu = false;
        }
        try {
            myURL.close();
        } catch (Exception ignored) {
            // best-effort close; nothing useful to do on failure
        }
        return ret;
    } // end of getLinks

    /* Returns a LinkedList of Strings containing all the words at a given
       url. Only the www.cs.mcgill.ca domain is considered. Personal web pages
       (those with URL containing "~") are not considered. Only html links are
       considered. The given url must contain the full path (implicit
       "index.html" is not allowed). Words inside html tags are not considered.
       Throws Exception if the same URL is queried twice; returns an empty
       list if the URL cannot be opened. */
    static public LinkedList<String> getContent(String url) throws Exception {
        if (queriedURL_content.containsKey(url))
            throw new Exception("Tried to query the following URL twice: " + url);
        queriedURL_content.put(url, "1");
        BufferedReader myURL;
        try {
            URLConnection conn = new URL(url).openConnection();
            myURL = new BufferedReader(new InputStreamReader(conn.getInputStream()));
        } catch (Exception e) {
            System.out.println("Error opening URL: " + url + "\n" + e);
            return new LinkedList<String>();
        }
        // NOTE(review): the original computed a "domain" here with buggy,
        // dead logic (the '/'-prefix branch was always overwritten) and never
        // used the result; the dead code has been removed.
        // Accumulate the whole file on one line (StringBuilder avoids the
        // original O(n^2) String concatenation).
        StringBuilder wholeFile = new StringBuilder();
        String line;
        while (true) {
            try {
                line = myURL.readLine();
            } catch (Exception e) {
                break; // read error: stop instead of retrying forever
            }
            if (line == null) break;
            wholeFile.append(' ').append(line);
        }
        try {
            myURL.close();
        } catch (Exception ignored) {
            // best-effort close
        }
        // Now parse wholeFile, removing anything between < and >.
        // Delimiters are returned as tokens so tags can be tracked.
        StringTokenizer st = new StringTokenizer(wholeFile.toString(),
                "<>~`!@#$%^&*(),.?;:[]{}+=-_\\|\" ", true);
        LinkedList<String> ret = new LinkedList<String>();
        boolean insideTag = false;
        while (st.hasMoreTokens()) {
            String next = st.nextToken();
            if (next.equals("<")) { insideTag = true; continue; }
            if (next.equals(">")) { insideTag = false; continue; }
            if (insideTag) continue;
            // Keep only tokens that start with a letter or digit (skips punctuation).
            if (Character.isLetterOrDigit(next.charAt(0))) ret.addLast(next.toLowerCase());
        }
        return ret;
    } // end of getContent

    // Returns a fixed URL: "%7e"/"%7E" is decoded to '~' and a single
    // trailing '/' is removed.
    // You should not need to use this method.
    static public String fixURL(String url) {
        // Replace the URL-encoded tilde by the literal character.
        int i = url.indexOf("%7e");
        if (i == -1) i = url.indexOf("%7E");
        if (i != -1) url = url.substring(0, i) + "~" + url.substring(i + 3);
        // Drop a single trailing slash (length guard fixes a crash on "").
        if (url.length() > 0 && url.charAt(url.length() - 1) == '/')
            url = url.substring(0, url.length() - 1);
        return url;
    } // end of fixURL

    // Returns a LinkedList of the whitespace-separated strings contained in a line.
    // A null line yields an empty list.
    // You should not need to use this method.
    static public LinkedList<String> getLineContent(String line) {
        LinkedList<String> ret = new LinkedList<String>();
        if (line == null) return ret;
        StringTokenizer st = new StringTokenizer(line, " ", false);
        while (st.hasMoreTokens()) ret.addLast(st.nextToken());
        return ret;
    } // end of getLineContent

    // Returns a LinkedList of URLs contained in the line (at most one: the
    // first 'a href' occurrence). If only a partial URL is present, domain
    // (or the site root for '/'-rooted links) is prepended.
    // You should not need to use this method.
    static public LinkedList<String> parseLine(String line, String domain) {
        String key = "a href=\"";
        LinkedList<String> ret = new LinkedList<String>();
        if (line == null) return ret;
        int position = line.indexOf(key);
        if (position == -1 || line.length() < key.length()) return ret;
        String s = line.substring(position + key.length());
        int end = s.indexOf("\"");
        if (end == -1) return ret;
        String u = s.substring(0, end);
        // Reject links that are a substring of the domain itself (navigational
        // self-links) or that contain any banned fragment.
        if (domain.indexOf(u) != -1) return ret;
        for (String banned : BANNED_LINK_PARTS) {
            if (u.indexOf(banned) != -1) return ret;
        }
        // Turn relative links into absolute ones.
        if (u.length() > 0 && u.charAt(0) == '/') u = "http://www.cs.mcgill.ca/" + u;
        else if (u.indexOf("http:") == -1) u = domain + "/" + u;
        // Collapse any "//" after the scheme (the "//" of "http://" sits before index 7).
        int dup;
        while ((dup = u.indexOf("//", 7)) != -1)
            u = u.substring(0, dup) + u.substring(dup + 1);
        if (u.charAt(u.length() - 1) == '/') u = u.substring(0, u.length() - 1);
        // Keep only well-formed links inside the www.cs.mcgill.ca domain.
        if (u.indexOf(" ") == -1 && u.indexOf("http://www.cs.mcgill.ca") != -1)
            ret.addLast(fixURL(u));
        return ret;
    } // end of parseLine

    // Writes the content of a URL to a local file whose name is derived from
    // the URL (':' and '/' replaced by '+') under internetFilesLocation.
    // Errors are reported on stdout; the method returns without writing.
    // You should not need to use this method.
    static public void writeContent(String url) {
        url = url.toLowerCase();
        String fileurl = internetFilesLocation + directoryChar
                + url.replace('/', '+').replace(':', '+');
        BufferedWriter out;
        try {
            out = new BufferedWriter(new FileWriter(fileurl));
        } catch (Exception e) {
            System.out.println("e0" + e);
            return; // the original fell through with out == null and crashed later
        }
        BufferedReader myURL;
        try {
            URLConnection conn = new URL(url).openConnection();
            myURL = new BufferedReader(new InputStreamReader(conn.getInputStream()));
        } catch (Exception e) {
            System.out.println("e1+" + e);
            try { out.close(); } catch (Exception ignored) {} // don't leak the writer
            return;
        }
        System.out.println("WRITING TO " + fileurl);
        String line;
        while (true) {
            try {
                line = myURL.readLine();
                if (line == null) break;
                out.write(line + "\n", 0, line.length() + 1);
            } catch (Exception e) {
                System.out.println("e2+" + e);
                break; // a read/write error ends the copy instead of retrying forever
            }
        }
        try {
            myURL.close();
            out.close();
        } catch (Exception e) {
            System.out.println("e3+" + e);
        }
        System.out.println("Done writing");
    } // end of writeContent
} // end of class htmlParsing