ParsedURL | Line # 100 | 104 | 37 | 95.2% |
0.95212764
|
No Tests | |||
1 | /* | |
2 | * Copyright (c) 2000-2005, University of Salford | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions are met: | |
7 | * | |
8 | * Redistributions of source code must retain the above copyright notice, this | |
9 | * list of conditions and the following disclaimer. | |
10 | * | |
11 | * Redistributions in binary form must reproduce the above copyright notice, | |
12 | * this list of conditions and the following disclaimer in the documentation | |
13 | * and/or other materials provided with the distribution. | |
14 | * | |
15 | * Neither the name of the University of Salford nor the names of its | |
16 | * contributors may be used to endorse or promote products derived from this | |
17 | * software without specific prior written permission. | |
18 | * | |
19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
20 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
23 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
29 | * POSSIBILITY OF SUCH DAMAGE. | |
30 | */ | |
31 | /* | |
32 | * Copyright (c) 2006, University of Kent | |
33 | * All rights reserved. | |
34 | * | |
35 | * Redistribution and use in source and binary forms, with or without | |
36 | * modification, are permitted provided that the following conditions are met: | |
37 | * | |
38 | * Redistributions of source code must retain the above copyright notice, this | |
39 | * list of conditions and the following disclaimer. | |
40 | * | |
41 | * Redistributions in binary form must reproduce the above copyright notice, | |
42 | * this list of conditions and the following disclaimer in the documentation | |
43 | * and/or other materials provided with the distribution. | |
44 | * | |
45 | * 1. Neither the name of the University of Kent nor the names of its | |
46 | * contributors may be used to endorse or promote products derived from this | |
47 | * software without specific prior written permission. | |
48 | * | |
49 | * 2. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS | |
50 | * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, | |
51 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
52 | * PURPOSE ARE DISCLAIMED. | |
53 | * | |
54 | * 3. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
55 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
56 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
57 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
58 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
59 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
60 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
61 | * POSSIBILITY OF SUCH DAMAGE. | |
62 | * | |
63 | * 4. YOU AGREE THAT THE EXCLUSIONS IN PARAGRAPHS 2 AND 3 ABOVE ARE REASONABLE | |
64 | * IN THE CIRCUMSTANCES. IN PARTICULAR, YOU ACKNOWLEDGE (1) THAT THIS | |
65 | * SOFTWARE HAS BEEN MADE AVAILABLE TO YOU FREE OF CHARGE, (2) THAT THIS | |
66 | * SOFTWARE IS NOT "PRODUCT" QUALITY, BUT HAS BEEN PRODUCED BY A RESEARCH | |
67 | * GROUP WHO DESIRE TO MAKE THIS SOFTWARE FREELY AVAILABLE TO PEOPLE WHO WISH | |
68 | * TO USE IT, AND (3) THAT BECAUSE THIS SOFTWARE IS NOT OF "PRODUCT" QUALITY | |
69 | * IT IS INEVITABLE THAT THERE WILL BE BUGS AND ERRORS, AND POSSIBLY MORE | |
70 | * SERIOUS FAULTS, IN THIS SOFTWARE. | |
71 | * | |
72 | * 5. This license is governed, except to the extent that local laws | |
73 | * necessarily apply, by the laws of England and Wales. | |
74 | */ | |
75 | ||
76 | ||
77 | package issrg.utils; | |
78 | ||
79 | import java.util.Vector; | |
80 | ||
/**
 * This class splits a URL string into its components and builds a normalised
 * form of the URL (the path without redundant "." and ".." elements).
 * No validation of the characters used in the components and no character
 * transformation (e.g. of %-escapes) is performed.
 *
 * <p>It handles only HTTP-like URLs:
 *
 * <p><code>protocol : // [[username [: password] @] host [:port]] [/ [ path ] [# anchor] [? query]]</code>
 *
 * <p>Note that host is also an optional part of the URL, so file: URLs are
 * also acceptable (but getHost() will return null).
 *
 * <p>There are corresponding methods to get these values; they can be null, if
 * the value is missing. Only the path is never null: it has 0 elements, if
 * missing; paths ending with '/' have a "." as the last element of the array.
 *
 * @author A.Otenko
 */
public class ParsedURL {
  private String url;
  private String protocol;
  private String userName;
  private String password;
  private String host;
  private String port;
  private String [] path;
  private String anchor;
  private String query;

  // Derived values, computed once by the full constructor.
  String [] normalisedPath;
  String normalisedURL;
  String pathString;

  /**
   * Creates an empty instance; none of the fields is initialised, so all the
   * getters return null until a subclass sets the values.
   */
  protected ParsedURL(){}

  /**
   * This constructor builds a ParsedURL given the original URL and parts of it.
   * Any part can be null, except the path. The normalised path, the path
   * string and the normalised URL are computed here.
   *
   * @param url - the original URL from which the parts were obtained
   * @param protocol - the protocol extracted from the original URL
   * @param userName - the user name as it appears in the URL
   * @param password - the password as it appears in the URL
   * @param host - the host name as it appears in the URL
   * @param port - the port specification String; may be an integer, but
   *   sometimes it is more than that (e.g. a range of ports)
   * @param path - the array of path elements; cannot be null, but can be empty
   * @param anchor - the anchor String (everything after "#" and before the
   *   query String)
   * @param query - the query String (everything after "?")
   */
  protected ParsedURL(String url, String protocol, String userName, String password,
                      String host, String port, String [] path, String anchor, String query){
    this.url=url;
    this.protocol=protocol;
    this.userName=userName;
    this.password=password;
    this.host=host;
    this.port=port;
    this.path=path;
    this.anchor=anchor;
    this.query=query;

    normalisedPath = normalisePath(path);

    StringBuffer pathBuffer = new StringBuffer();
    for (int i=0; i<normalisedPath.length; i++){
      pathBuffer.append('/').append(normalisedPath[i]);
    }
    pathString = pathBuffer.toString();

    // The getters (not the fields) are used on purpose: this is what the class
    // has always done, so subclasses overriding a getter keep their effect on
    // the normalised URL.
    StringBuffer sb = new StringBuffer();
    sb.append(getProtocol()).append("://");
    if (getHost()!=null){ // user name, password and port are meaningful only with a host
      if (getUserName()!=null){
        sb.append(getUserName());
        if (getPassword()!=null){
          sb.append(':').append(getPassword());
        }
        sb.append('@');
      }
      sb.append(getHost());
      if (getPort()!=null){
        sb.append(':').append(getPort());
      }
    }
    sb.append(pathString); // normalised path is here
    if (getAnchor()!=null){
      sb.append('#').append(getAnchor());
    }
    if (getQuery()!=null){
      sb.append('?').append(getQuery());
    }
    normalisedURL = sb.toString();
  }

  /**
   * Removes redundant "." and ".." elements from a path.
   *
   * <p>A "." is dropped unless it is the last element (a trailing "." marks a
   * path that ended with '/'). A ".." is dropped together with the element
   * kept before it, if there is one; a ".." at the root is simply dropped.
   *
   * @param path - the path elements; must not be null and must not contain
   *   null elements
   *
   * @return a new array with the normalised path elements
   */
  private static String [] normalisePath(String [] path){
    String [] kept = new String[path.length]; // the result is never longer than the input
    int count = 0;

    for (int i=0; i<path.length; i++){
      // Elements have always been interned here; this is preserved in case
      // callers compare the elements of getPath() by reference.
      String element = path[i].intern();

      if (".".equals(element) && i<path.length-1) continue; // skip a "." that is not trailing
      if ("..".equals(element)){
        if (count>0) count--; // step one level up, if there is a level to leave
        continue;
      }

      kept[count++] = element;
    }

    String [] result = new String[count];
    System.arraycopy(kept, 0, result, 0, count);
    return result;
  }

  /**
   * @return the protocol of the URL as it has been provided to the constructor
   */
  public String getProtocol(){
    return protocol;
  }

  /**
   * @return the original URL as it has been provided to the constructor
   */
  public String getURL(){
    return url;
  }

  /**
   * @return the user name as it has been provided to the constructor
   */
  public String getUserName(){
    return userName;
  }

  /**
   * @return the password as it has been provided to the constructor
   */
  public String getPassword(){
    return password;
  }

  /**
   * @return the host name as it has been provided to the constructor
   */
  public String getHost(){
    return host;
  }

  /**
   * @return the port string as it has been provided to the constructor
   */
  public String getPort(){
    return port;
  }

  /**
   * This method returns normalised path (excessive "." and ".." are removed)
   *
   * @return array of strings, representing the path; no "." or ".." are there,
   *   only the last element may be a "." if the URL ends with a "/" and means
   *   that the previous name in the path is a directory
   */
  public String [] getPath(){
    return normalisedPath;
  }

  /**
   * This method returns the path as it is in the URL ("." and ".." are
   * possible).
   *
   * @return array of strings, representing the path; if no excessive "." or ".."
   *   were used, it is the same as getPath()
   */
  public String [] getOriginalPath(){
    return path;
  }

  /**
   * This method returns the normalised path as a String.
   *
   * @return every element of the normalised path prefixed with '/'; an empty
   *   String, if the path has no elements
   */
  public String getPathString(){
    return pathString;
  }

  /**
   * @return the anchor string as it has been provided to the constructor
   */
  public String getAnchor(){
    return anchor;
  }

  /**
   * @return the query string as it has been provided to the constructor
   */
  public String getQuery(){
    return query;
  }

  /**
   * This method returns a normalised URL (i.e. the path is without '.' and
   * '..' elements, etc.)
   *
   * @return the normalised URL String
   */
  public String getNormalizedURL(){
    return normalisedURL;
  }

  /**
   * This method parses a URL string, and returns a ParsedURL object, if
   * succeeded.
   * It returns null, if URL is not valid.
   *
   * <p>Valid URLs correspond to the following syntax:
   * <p><code>[url:]protocol : // [[username [: password]@] host [: port]] [/ [path] [# [anchor]]] [? [query]]</code>
   *
   * <p>In particular, the following are rejected: a prefix other than "url:"
   * in front of the protocol; a '@' with no user name in front of it or no
   * host after it; a user name followed by ':' without a password; a host
   * followed by ':' without a port; an anchor without a path.
   *
   * @param url is a string encoding of the URL; no character transformation is
   *   done, e.g. %20 remains itself, and is not substituted by a space
   *
   * @return ParsedURL object, if parse succeeded, or null, if parse failed.
   */
  public static ParsedURL parseURL(String url){
    String protocol=null, userName=null, password=null, host=null, port=null,
           anchor=null, query=null;
    String [] path=new String[0];

    String [] r=split(url, "://"); // find [url:]protocol and the rest of the URL
    if (r[0]==null) return null;   // url is null or empty, or there is nothing before "://"

    String rest=r[1];              // everything after "://"; may be null

    r=split(r[0], ":");            // r[0] is "url" or the protocol, r[1] is the protocol or null
    if (r[1]==null){
      protocol=r[0];
    } else {
      protocol=r[1];
      if (r[0]==null || !r[0].equalsIgnoreCase("url")) return null; // only "url:" may precede the protocol
    }

    if (rest!=null){
      int hashPos=rest.indexOf('#');
      int tailPos=rest.indexOf('?');

      // if there is no '?', or the '#' comes before the '?', then the '#'
      // marks the end of the host-and-path part
      if (tailPos<0 || (hashPos>=0 && hashPos<tailPos)) tailPos=hashPos;

      if (tailPos>=0){ // position 0 counts too: "?q" must not be mistaken for a host
        r=split(rest.substring(tailPos), "?"); // r[0] is "#anchor" or null, r[1] is the query or null

        if (r[0]!=null && !r[0].equals("#")){ // a lonely "#" means there is no anchor
          anchor=r[0].substring(1); // skip '#'
        }
        query=r[1];

        rest=rest.substring(0, tailPos);
      }
      // now rest is the user:psw@host:port/path bit

      r=split(rest, "/");
      String authority=r[0]; // null, if rest starts with '/' (no host - e.g. a file: URL)
      String p=r[1];         // the path without the leading '/', or null
      if (p==null && rest.endsWith("/")){
        p=""; // a single trailing '/': a path of one level with an empty name
      }

      if (authority!=null){
        r=split(authority, "@"); // r[0] is username:password or host:port, r[1] is host:port or null

        String hostPort;
        if (r[1]==null){
          // no '@' at all, or a '@' with nothing after it; the latter must not
          // turn the user name into a host name
          if (authority.endsWith("@")) return null;
          hostPort=r[0];
        } else {
          hostPort=r[1];

          // '@' is present, so there must be a user name in front of it, and
          // "username:" without a password is not allowed
          if (r[0]==null || r[0].endsWith(":")) return null;

          r=split(r[0], ":");
          userName=r[0];
          password=r[1];

          if (userName==null) return null; // ":password" - no user name
        }

        r=split(hostPort, ":"); // r[0] is the host, r[1] is the port or null
        if (r[0]==null || (r[1]==null && hostPort.endsWith(":"))) return null; // no host, or "host:" without a port

        host=r[0];
        port=r[1]; // no check that the port is a number: the string is only split into components
      }

      if (p!=null){
        // keep the trailing empty element, so that a path ending with '/' is
        // distinguishable; empty elements ("//" or a trailing '/') become "."
        path=p.split("/", -1);
        for (int i=0; i<path.length; i++){
          if (path[i].length()==0) path[i]=".";
        }
      }
    }

    if (path.length==0 && anchor!=null) return null; // an anchor requires a path

    return new ParsedURL(url, protocol, userName, password, host, port, path, anchor, query);
  }

  /**
   * This is a utility method that splits the string into two substrings, having
   * found a c string in it. The first substring is the string before the first
   * occurrence of c, the second substring is the string after the first
   * occurrence of c. If c is not found, the first substring is the whole
   * string, and the second is null.
   *
   * <p>If the substrings are empty, they will be null.
   *
   * <p>E.g. split("@", "@") == {null, null}
   * <br>split("://path", "://") == {null, "path"}
   * <br>split("http://host", "://") == {"http", "host"}
   * <br>split("host", "@") == {"host", null}
   *
   * @param u - the string to split; if null, both results are null
   * @param c - the separator to look for
   *
   * @return an array of exactly two elements, each of which can be null
   */
  private static String [] split(String u, String c){
    String [] r = new String[2];

    if (u!=null){
      int j=u.indexOf(c);
      if (j<0){
        r[0]=u.length()==0? null: u;
      } else {
        int after=j+c.length();
        r[0]=j==0? null: u.substring(0, j);
        r[1]=after<u.length()? u.substring(after): null;
      }
    }

    return r;
  }
}
|