Name | Executed | Routines | % | Executed | Lines | % | Unexecuted |
/home/matt/eu/rds/include/std/net/url.e | 4 | 4 | 100.00% | 119 | 129 | 92.25% | 10 |
Routine | Executed | Lines | Unexecuted | |
parse() | 56 | 61 | 91.80% | 5 |
decode() | 14 | 18 | 77.78% | 4 |
parse_querystring() | 29 | 30 | 96.67% | 1 |
encode() | 15 | 15 | 100.00% | 0 |
# | Executed | |
1 | --**** | |
2 | -- == URL handling | |
3 | -- | |
4 | ||
5 | namespace url | |
6 | ||
7 | include std/get.e | |
8 | include std/map.e | |
9 | ||
10 | --**** | |
11 | -- === Parsing | |
12 | -- | |
13 | ||
14 | 2 | constant PAIR_SEP = {'&', ';'}, HEX_SIG = '%', WHITESPACE = '+', VALUE_SEP = '=' |
15 | ||
16 | --** | |
17 | -- Parse a query string into a map | |
18 | -- | |
19 | -- Parameters: | |
20 | -- # ##query_string##: Query string to parse | |
21 | -- | |
22 | -- Returns: | |
23 | -- [[:map]] containing the key/value pairs | |
24 | -- | |
25 | -- Example 1: | |
26 | -- | |
27 | -- map qs = parse_querystring("name=John&age=18") | |
28 | -- printf(1, "%s is %s years old\n", { map:get(qs, "name"), map:get(qs, "age") }) | |
29 | -- | |
30 | -- | |
31 | ||
32 | 2 | |
33 | atom i, char | |
34 | object tmp | |
35 | 2 | sequence charbuf, fieldbuf, fname="" |
36 | 2 | map:map the_map = map:new() |
37 | ||
38 | 2 | if atom(query_string) then |
39 | 0 | return the_map |
40 | end if | |
41 | ||
42 | 2 | charbuf = {} fieldbuf = {} i = 1 |
43 | 2 | while i <= length(query_string) do |
44 | 57 | char = query_string[i] -- character we're working on |
45 | 57 | if equal(char, HEX_SIG) then |
46 | 1 | tmp = value("#" & query_string[i+1] & query_string[i+2]) |
47 | 1 | charbuf &= tmp[2] |
48 | 1 | i += 3 |
49 | 56 | elsif equal(char, WHITESPACE) then |
50 | 1 | charbuf &= " " |
51 | 1 | i += 1 |
52 | 55 | elsif equal(char, VALUE_SEP) then |
53 | 5 | fname = charbuf |
54 | 5 | charbuf = {} |
55 | 5 | i += 1 |
56 | 50 | elsif find(char, PAIR_SEP) then |
57 | 3 | map:put(the_map, fname, charbuf) |
58 | 3 | fname = {} |
59 | 3 | charbuf = {} |
60 | 3 | i += 1 |
61 | else | |
62 | 47 | charbuf &= char |
63 | 47 | i += 1 |
64 | end if | |
65 | 57 | end while |
66 | ||
67 | 2 | if length(fname) then |
68 | 2 | map:put(the_map, fname, charbuf) |
69 | end if | |
70 | ||
71 | 2 | return the_map |
72 | end function | |
73 | ||
74 | 2 | public enum URL_PROTOCOL, URL_HOSTNAME, URL_PORT, URL_PATH, URL_USER, URL_PASSWORD, |
75 | 2 | URL_QUERY_STRING |
76 | ||
77 | --** | |
78 | -- Parse a URL returning its various elements. | |
79 | -- | |
80 | -- Parameters: | |
81 | -- # ##url##: URL to parse | |
82 | -- # ##querystring_also##: Parse the query string into a map also? | |
83 | -- | |
84 | -- Returns: | |
85 | -- A multi-element sequence containing: | |
86 | -- # protocol | |
87 | -- # host name | |
88 | -- # port | |
89 | -- # path | |
90 | -- # user name | |
91 | -- # password | |
92 | -- # query string | |
93 | -- | |
94 | -- Or, zero if the URL could not be parsed. | |
95 | -- | |
96 | -- Notes: | |
97 | -- If the host name, port, path, username, password or query string are not part of the | |
98 | -- URL they will be returned as an integer value of zero. | |
99 | -- | |
100 | -- Example 1: | |
101 | -- | |
102 | -- sequence parsed = parse("http://user:pass@www.debian.org:80/index.html?name=John&age=39") | |
103 | -- -- parsed is | |
104 | -- -- { | |
105 | -- -- "http", | |
106 | -- -- "www.debian.org", | |
107 | -- -- 80, | |
108 | -- -- "/index.html", | |
109 | -- -- "user", | |
110 | -- -- "pass", | |
111 | -- -- "name=John&age=39" | |
112 | -- -- } | |
113 | -- | |
114 | -- | |
115 | ||
116 | 9 | |
117 | 9 | sequence protocol = "" |
118 | object host_name, path, user_name, password, query_string | |
119 | integer port | |
120 | ||
121 | -- Set the defaults for some optional values | |
122 | 9 | host_name = 0 |
123 | 9 | port = 0 |
124 | 9 | path = 0 |
125 | 9 | user_name = 0 |
126 | 9 | password = 0 |
127 | 9 | query_string = 0 |
128 | ||
129 | 9 | integer pos = find(':', url) |
130 | 9 | if not pos then |
131 | 0 | return 0 |
132 | end if | |
133 | ||
134 | 9 | protocol = url[1..pos - 1] |
135 | 9 | pos += 1 |
136 | ||
137 | -- Can have a maximum of 2 // before we move into the hostname or possibly | |
138 | -- the path (http://john.com) or (file:///home/jeremy/hello.txt) | |
139 | 9 | if url[pos] = '/' then |
140 | 8 | pos += 1 |
141 | end if | |
142 | 9 | if url[pos] = '/' then |
143 | 8 | pos += 1 |
144 | end if | |
145 | 9 | if url[pos] = '/' then |
146 | -- We do not have a username, password, host or port, we have moved right into | |
147 | -- the path area of the URL. Let's jump ahead | |
148 | 0 | goto "parse_path" |
149 | end if | |
150 | ||
151 | 9 | integer at = find('@', url) |
152 | 9 | if not at then |
153 | -- We do not have a user or password, skip ahead to parsing the domain | |
154 | 6 | goto "parse_domain" |
155 | end if | |
156 | ||
157 | 3 | integer password_colon = find(':', url, pos) |
158 | 3 | if password_colon > 0 and password_colon < at then |
159 | -- We have a password too! | |
160 | 1 | user_name = url[pos..password_colon-1] |
161 | 1 | password = url[password_colon+1..at-1] |
162 | else | |
163 | -- Just a user name | |
164 | 2 | user_name = url[pos..at-1] |
165 | end if | |
166 | ||
167 | 3 | pos = at + 1 |
168 | ||
169 | label "parse_domain" | |
170 | ||
171 | 9 | integer qs_start = find('?', url, pos) |
172 | 9 | integer first_slash = find('/', url, pos) |
173 | 9 | integer port_colon = find(':', url, pos) |
174 | ||
175 | 9 | if port_colon then |
176 | -- We can easily read the host until the port colon | |
177 | 3 | host_name = url[pos..port_colon-1] |
178 | else | |
179 | -- No port colon, so finding where the host name ends takes a bit more work | |
180 | 6 | if not first_slash then |
181 | -- there is no path, thus we must parse to either the query string begin | |
182 | -- or the string end | |
183 | 4 | if not qs_start then |
184 | 2 | host_name = url[pos..$] |
185 | else | |
186 | 2 | host_name = url[pos..qs_start-1] |
187 | end if | |
188 | else | |
189 | -- Ok, we can read up to the first slash | |
190 | 2 | host_name = url[pos..first_slash-1] |
191 | end if | |
192 | end if | |
193 | ||
194 | 9 | if port_colon then |
195 | 3 | integer port_end = 0 |
196 | ||
197 | 3 | if first_slash then |
198 | 3 | port_end = first_slash - 1 |
199 | 0 | elsif qs_start then |
200 | 0 | port_end = qs_start - 1 |
201 | else | |
202 | 0 | port_end = length(url) |
203 | end if | |
204 | ||
205 | 3 | port = defaulted_value(url[port_colon+1..port_end], 0) |
206 | end if | |
207 | ||
208 | -- Increment the position to the next element to parse | |
209 | 9 | if first_slash then |
210 | 5 | pos = first_slash |
211 | 4 | elsif qs_start then |
212 | 2 | pos = qs_start |
213 | else | |
214 | -- Nothing more to parse | |
215 | 2 | goto "parse_done" |
216 | end if | |
217 | ||
218 | label "parse_path" | |
219 | ||
220 | 7 | if not qs_start then |
221 | 3 | path = url[pos..$] |
222 | 3 | goto "parse_done" |
223 | end if | |
224 | ||
225 | -- Avoid getting a path when there is none. | |
226 | 4 | if pos != qs_start then |
227 | 2 | path = url[pos..qs_start - 1] |
228 | end if | |
229 | ||
230 | 4 | pos = qs_start |
231 | ||
232 | label "parse_query_string" | |
233 | ||
234 | 4 | query_string = url[qs_start + 1..$] |
235 | ||
236 | 4 | if querystring_also and length(query_string) then |
237 | 1 | query_string = parse_querystring(query_string) |
238 | end if | |
239 | ||
240 | label "parse_done" | |
241 | 9 | return { protocol, host_name, port, path, user_name, password, query_string } |
242 | end function | |
243 | ||
244 | --**** | |
245 | -- === URL encoding and decoding | |
246 | -- | |
247 | ||
248 | -- TODO: This is causing a creole parsing problem | |
249 | -- HTML form data is usually URL-encoded to package it into a GET or POST submission. | |
250 | -- In a nutshell, here's how you URL-encode the name-value pairs of the form data: | |
251 | -- # Convert all "unsafe" characters in the names and values to "%xx", where "xx" is the ascii | |
252 | -- value of the character, in hex. "Unsafe" characters include =, &, %, +, non-printable | |
253 | -- characters, and any others you want to encode-- there's no danger in encoding too many | |
254 | -- characters. For simplicity, you might encode all non-alphanumeric characters. | |
255 | -- A big nono is \n and \r chars in POST data. | |
256 | -- # Change all spaces to pluses. | |
257 | -- # String the names and values together with = and &, like | |
258 | -- name1=value1&name2=value2&name3=value3 | |
259 | -- # This string is your message body for POST submissions, or the query string for GET submissions. | |
260 | -- | |
261 | -- For example, if a form has a field called "name" that's set to "Lucy", and a field called "neighbors" | |
262 | -- that's set to "Fred & Ethel", the URL-encoded form data would be: | |
263 | -- | |
264 | -- name=Lucy&neighbors=Fred+%26+Ethel <<== note no \n or \r | |
265 | -- | |
266 | -- with a length of 34. | |
267 | ||
268 | constant | |
269 | 2 | alphanum = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz01234567890", |
270 | 2 | hexnums = "0123456789ABCDEF" |
271 | ||
272 | --** | |
273 | -- Converts all non-alphanumeric characters in a string to their | |
274 | -- percent-sign hexadecimal representation, or plus sign for | |
275 | -- spaces. | |
276 | -- | |
277 | -- Parameters: | |
278 | -- # ##what## : the string to encode | |
279 | -- # ##spacecode## : what to insert in place of a space | |
280 | -- | |
281 | -- Returns: | |
282 | -- A **sequence**, the encoded string. | |
283 | -- | |
284 | -- Comments: | |
285 | -- ##spacecode## defaults to ##+## as it is more correct, however, some sites | |
286 | -- want ##%20## as the space encoding. | |
287 | -- | |
288 | -- Example 1: | |
289 | -- | |
290 | -- puts(1, encode("Fred & Ethel")) | |
291 | -- -- Prints "Fred+%26+Ethel" | |
292 | -- | |
293 | -- | |
294 | -- See Also: | |
295 | -- [[:decode]] | |
296 | -- | |
297 | ||
298 | 2 | |
299 | 2 | sequence encoded = "" |
300 | 2 | object junk = "", junk1, junk2 |
301 | ||
302 | 2 | for idx = 1 to length(what) do |
303 | 26 | if find(what[idx],alphanum) then |
304 | 20 | encoded &= what[idx] |
305 | ||
306 | 6 | elsif equal(what[idx],' ') then |
307 | 4 | encoded &= spacecode |
308 | ||
309 | 2 | elsif 1 then |
310 | 2 | junk = what[idx] |
311 | 2 | junk1 = floor(junk / 16) |
312 | 2 | junk2 = floor(junk - (junk1 * 16)) |
313 | 2 | encoded &= "%" & hexnums[junk1+1] & hexnums[junk2+1] |
314 | end if | |
315 | 26 | end for |
316 | ||
317 | 2 | return encoded |
318 | end function | |
319 | ||
320 | --** | |
321 | -- Convert all encoded entities to their decoded counterparts | |
322 | -- | |
323 | -- Parameters: | |
324 | -- # ##what##: what value to decode | |
325 | -- | |
326 | -- Returns: | |
327 | -- A decoded sequence | |
328 | -- | |
329 | -- Example 1: | |
330 | -- | |
331 | -- puts(1, decode("Fred+%26+Ethel")) | |
332 | -- -- Prints "Fred & Ethel" | |
333 | -- | |
334 | -- | |
335 | -- See Also: | |
336 | -- [[:encode]] | |
337 | -- | |
338 | ||
339 | 2 | |
340 | 2 | integer k = 1 |
341 | ||
342 | 2 | while k <= length(what) do |
343 | 26 | if what[k] = '+' then |
344 | 2 | what[k] = ' ' -- '+' is a special case: it decodes to a space |
345 | 24 | elsif what[k] = '%' then |
346 | 4 | if k = length(what) then |
347 | -- strip empty percent sign | |
348 | 0 | what = what[1..k-1] & what[k+1 .. $] |
349 | 4 | elsif k+1 = length(what) then |
350 | 0 | what[k] = value("#0" & what[k+1]) |
351 | 0 | what[k] = what[k][2] |
352 | 0 | what = what[1..k] & what[k+2 .. $] |
353 | else | |
354 | 4 | what[k] = value("#" & what[k+1..k+2]) |
355 | 4 | what[k] = what[k][2] |
356 | 4 | what = what[1..k] & what[k+3 .. $] |
357 | end if | |
358 | else | |
359 | -- do nothing if it is a regular char ('0' or 'A' or etc) | |
360 | end if | |
361 | ||
362 | 26 | k += 1 |
363 | 26 | end while |
364 | ||
365 | 2 | return what |
366 | end function |