libtld: /home/snapwebsites/snapcpp/contrib/libtld/tests/tld_test_domain_lowercase.c Source File

libtld  1.5.13
A library to determine the Top-Level Domain name of any URL.
tld_test_domain_lowercase.c
Go to the documentation of this file.
1 /* TLD library -- test converting domain names to lowercase
2  * Copyright (c) 2011-2019 Made to Order Software Corp. All Rights Reserved
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included
13  * in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22  */
23 
31 #include "libtld/tld.h"
32 #include <string.h>
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <time.h>
36 #include <limits.h>
37 #include <wctype.h>
38 
39 int err_count = 0; /* number of errors so we know whether to exit with 0 or 1 (see main()) */
40 int verbose = 0; /* whether the user asked for verbosity (-v), false by default */
41 
42 
43 
/** \brief Append one byte to a URI style string.
 *
 * ASCII letters, digits and the characters . - ! ~ / _ are copied as is
 * (one character of output). Any other value is written as a three
 * character "%XX" escape.
 *
 * \param[in,out] out  Pointer to the write cursor; advanced by 1 or 3.
 * \param[in] wc  The byte value to append.
 * \param[in] force_caps  When not zero the hexadecimal digits are always
 *                        uppercase, otherwise rand() picks the case.
 */
void test_add_byte(char **out, int wc, int force_caps)
{
    int keep_as_is;
    const char *escape_format;

    switch(wc)
    {
    case '.':
    case '-':
    case '!':
    case '~':
    case '/':
    case '_':
        keep_as_is = 1;
        break;

    default:
        keep_as_is = (wc >= 'A' && wc <= 'Z')
                  || (wc >= 'a' && wc <= 'z')
                  || (wc >= '0' && wc <= '9');
        break;

    }

    if(keep_as_is)
    {
        *(*out)++ = wc;
        return;
    }

    // rand() must only be consumed when the caller did not force the
    // case, so the short circuit on force_caps is significant
    escape_format = (force_caps || (rand() & 1) != 0) ? "%%%02X" : "%%%02x";

    // sprintf() also writes a terminating '\0' right after the escape
    sprintf(*out, escape_format, wc);
    *out += 3;
}
74 
75 
/** \brief Append a code point as UTF-8, one byte at a time.
 *
 * The value is split in 1 to 4 bytes following the UTF-8 bit layout and
 * each byte is handed to test_add_byte(), which may escape it as %XX.
 *
 * No validation is performed: the value is encoded whatever it is, and
 * anything of 0x10000 or more uses the 4 byte form.
 *
 * \param[in,out] out  Pointer to the write cursor, advanced as bytes are added.
 * \param[in] wc  The value to encode.
 * \param[in] force_caps  Passed through to test_add_byte().
 */
void test_to_utf8(char **out, int wc, int force_caps)
{
    int lead_marker;
    int continuation_count;
    int shift;

    if(wc < 0x80)
    {
        // plain ASCII, a single byte
        test_add_byte(out, wc, force_caps);
        return;
    }

    if(wc < 0x800)
    {
        lead_marker = 0xC0;
        continuation_count = 1;
    }
    else if(wc < 0x10000)
    {
        lead_marker = 0xE0;
        continuation_count = 2;
    }
    else
    {
        lead_marker = 0xF0;
        continuation_count = 3;
    }

    // the lead byte carries the top bits, not masked
    test_add_byte(out, ((wc >> (6 * continuation_count)) | lead_marker), force_caps);

    // then 6 bits per continuation byte, most significant group first
    for(shift = 6 * (continuation_count - 1); shift >= 0; shift -= 6)
    {
        test_add_byte(out, (((wc >> shift) & 0x3F) | 0x80), force_caps);
    }
}
101 
102 
103 void test_all_characters()
104 {
105  int wc;
106  char buf[256], *s, *r;
107 
108  for(wc = 1; wc < 0x110000; ++wc)
109  {
110  if((wc >= 0xD800 && wc <= 0xDFFF) // UTF-16 stuff ignored
111  || (wc & 0xFFFF) == 0xFFFE
112  || (wc & 0xFFFF) == 0xFFFF
113  || wc == '/')
114  {
115  // those code points must be ignored because they
116  // really don't work in a domain name
117  continue;
118  }
119 
120  s = buf;
121  test_to_utf8(&s, wc, 0);
122  *s = '\0';
123 
124  r = tld_domain_to_lowercase(buf);
125 
126  s = buf;
127  test_to_utf8(&s, towlower(wc), 1); // force caps in %XX notication
128  *s = '\0';
129 
130  if(r == NULL)
131  {
132  fprintf(stderr, "error: character 0x%06X generated an error and tld_domain_to_lower() returned NULL (expected \"%s\")\n", wc, buf);
133  }
134  else
135  {
136  if(strcmp(r, buf) != 0)
137  {
138  fprintf(stderr, "error: character 0x%06X was not converted back and force as expected (expected \"%s\", received \"%s\")\n", wc, buf, r);
139  }
140 
141  // done with the result
142  free(r);
143  }
144  }
145 }
146 
147 
148 void test_empty()
149 {
150  char *r;
151 
152  // NULL as input, returns NULL
153  r = tld_domain_to_lowercase(NULL);
154  if(r != NULL)
155  {
156  ++err_count;
157  fprintf(stderr, "error: tld_domain_to_lowercase(NULL) is expected to return NULL.\n");
158  }
159 
160  // an empty string also returns NULL as result
161  r = tld_domain_to_lowercase("");
162  if(r != NULL)
163  {
164  ++err_count;
165  fprintf(stderr, "error: tld_domain_to_lowercase(\"\") is expected to return NULL.\n");
166  }
167 }
168 
169 
170 void test_invalid_xx()
171 {
172  char *r;
173  char buf[256];
174  int i;
175 
176  r = tld_domain_to_lowercase("%AZ");
177  if(r != NULL)
178  {
179  ++err_count;
180  fprintf(stderr, "error: tld_domain_to_lowercase(\"%%AZ\") is expected to return NULL.\n");
181  }
182 
183  r = tld_domain_to_lowercase("%ZA");
184  if(r != NULL)
185  {
186  ++err_count;
187  fprintf(stderr, "error: tld_domain_to_lowercase(\"%%ZA\") is expected to return NULL.\n");
188  }
189 
190  // these are 3 x a with an acute accent (as used in Spanish)
191  r = tld_domain_to_lowercase("\xC3\xA1\xC3\xA1\xC3\xA1");
192  if(r != NULL)
193  {
194  ++err_count;
195  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xC3\xA1\xC3\xA1\xC3\xA1\") is expected to return NULL because of an overflow.\n");
196  }
197 
198  // these are 2 x a with an acute accent followed by "ab"
199  // this time the overflow happens when the 'a' is hit
200  r = tld_domain_to_lowercase("\xC3\xA1\xC3\xA1\x61\x62");
201  if(r != NULL)
202  {
203  ++err_count;
204  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xC3\xA1\xC3\xA1\x61\x62\") is expected to return NULL because of an overflow.\n");
205  }
206 
207  // these are 3 x 0x0911 (Devanagari letter candra o)
208  r = tld_domain_to_lowercase("\xE0\xA4\x91\xE0\xA4\x91\xE0\xA4\x91");
209  if(r != NULL)
210  {
211  ++err_count;
212  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xE0\xA4\x91\xE0\xA4\x91\xE0\xA4\x91\") is expected to return NULL because of an overflow.\n");
213  }
214 
215  // these are 2 x 0x0911 and a # in between (Devanagari letter candra o)
216  r = tld_domain_to_lowercase("\xE0\xA4\x91#\xE0\xA4\x91");
217  if(r != NULL)
218  {
219  ++err_count;
220  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xE0\xA4\x91#\xE0\xA4\x91\") is expected to return NULL because of an overflow.\n");
221  }
222 
223  // these are 2 x 0x0911 and a q in between (Devanagari letter candra o)
224  r = tld_domain_to_lowercase("\xE0\xA4\x91q\xE0\xA4\x91");
225  if(r != NULL)
226  {
227  ++err_count;
228  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xE0\xA4\x91q\xE0\xA4\x91\") is expected to return NULL because of an overflow.\n");
229  }
230 
231  // these are 3 x 0x13F0B (Miao letter da)
232  r = tld_domain_to_lowercase("\xF0\x96\xBC\x8B\xF0\x96\xBC\x8B\xF0\x96\xBC\x8B");
233  if(r != NULL)
234  {
235  ++err_count;
236  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xF0\x96\xBC\x8B\xF0\x96\xBC\x8B\xF0\x96\xBC\x8B\") is expected to return NULL because of an overflow.\n");
237  }
238 
239  // these are 2 x 0x13F0B with # in between (Miao letter da)
240  r = tld_domain_to_lowercase("\xF0\x96\xBC\x8B#\xF0\x96\xBC\x8B");
241  if(r != NULL)
242  {
243  ++err_count;
244  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xF0\x96\xBC\x8B#\xF0\x96\xBC\x8B\") is expected to return NULL because of an overflow.\n");
245  }
246 
247  // these are 2 x 0x13F0B with q in between (Miao letter da)
248  r = tld_domain_to_lowercase("\xF0\x96\xBC\x8Bq\xF0\x96\xBC\x8B");
249  if(r != NULL)
250  {
251  ++err_count;
252  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xF0\x96\xBC\x8Bq\xF0\x96\xBC\x8B\") is expected to return NULL because of an overflow.\n");
253  }
254 
255  // these are 2 x 0x13F0B with qq in between (Miao letter da)
256  r = tld_domain_to_lowercase("\xF0\x96\xBC\x8Bqq\xF0\x96\xBC\x8B");
257  if(r != NULL)
258  {
259  ++err_count;
260  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xF0\x96\xBC\x8Bqq\xF0\x96\xBC\x8B\") is expected to return NULL because of an overflow.\n");
261  }
262 
263  // these are 2 x 0x13F0B with qqq in between (Miao letter da)
264  r = tld_domain_to_lowercase("\xF0\x96\xBC\x8Bqqq\xF0\x96\xBC\x8B");
265  if(r != NULL)
266  {
267  ++err_count;
268  fprintf(stderr, "error: tld_domain_to_lowercase(\"\xF0\x96\xBC\x8Bqqq\xF0\x96\xBC\x8B\") is expected to return NULL because of an overflow.\n");
269  }
270 
271  for(i = 0xF8; i <= 0xFF; ++i)
272  {
273  snprintf(buf, sizeof(buf), "+%%%02X+", i);
274 
275  r = tld_domain_to_lowercase(buf);
276  if(r != NULL)
277  {
278  ++err_count;
279  fprintf(stderr, "error: tld_domain_to_lowercase(\"%s\") is expected to return NULL because of the invalid byte (introduction byte).\n", buf);
280  }
281  }
282 
283  for(i = 0x80; i <= 0xBF; ++i)
284  {
285  snprintf(buf, sizeof(buf), "+%%%02X+", i);
286 
287  r = tld_domain_to_lowercase(buf);
288  if(r != NULL)
289  {
290  ++err_count;
291  fprintf(stderr, "error: tld_domain_to_lowercase(\"%s\") is expected to return NULL because of the invalid byte (continuation byte).\n", buf);
292  }
293  }
294 
295  // byte missing (end of string found before end of UTF-8 character)
296  for(i = 0xC0; i <= 0xF7; ++i)
297  {
298  buf[0] = i;
299  buf[1] = '\0';
300  r = tld_domain_to_lowercase(buf);
301  if(r != NULL)
302  {
303  ++err_count;
304  fprintf(stderr, "error: tld_domain_to_lowercase(\"%s\") is expected to return NULL because of the invalid UTF-8 sequence (end of string found too early).\n", buf);
305  }
306  }
307 
308  // continuation byte out of range
309  for(i = 0x00; i <= 0xFF; ++i)
310  {
311  if(i >= 0x80 && i <= 0xBF)
312  {
313  // that's a valid continuation
314  continue;
315  }
316  buf[0] = rand() % (0xF8 - 0xC0) + 0xC0;
317  buf[1] = i;
318  buf[2] = '\0';
319  r = tld_domain_to_lowercase(buf);
320  if(r != NULL)
321  {
322  ++err_count;
323  fprintf(stderr, "error: tld_domain_to_lowercase(\"%s\") is expected to return NULL because of the invalid UTF-8 sequence (continuation byte out of range).\n", buf);
324  }
325  }
326 
327  for(i = 1; i < 0x110000; ++i)
328  {
329  if((i >= 0xD800 && i <= 0xDFFF) // UTF-16 stuff ignored
330  || (i & 0xFFFF) == 0xFFFE
331  || (i & 0xFFFF) == 0xFFFF)
332  {
333  r = buf;
334  test_to_utf8(&r, i, rand() & 1);
335  *r = '\0';
336 
337  r = tld_domain_to_lowercase(buf);
338  if(r != NULL)
339  {
340  ++err_count;
341  fprintf(stderr, "error: tld_domain_to_lowercase(\"%s\") is expected to return NULL because of the invalid UTF-8 sequence (continuation byte out of range).\n", buf);
342  }
343  }
344  }
345 
346  for(i = 0x110000;; ++i)
347  {
348  r = buf;
349  test_to_utf8(&r, i, rand() & 1);
350  *r = '\0';
351 
352  // we only save up to 4 bytes, so to check overflow, we expect %F0
353  // as the first byte...
354  if(strncmp(buf, "%f0", 3) == 0
355  || strncmp(buf, "%F0", 3) == 0)
356  {
357  // no need to test further, we hit the case of 0xF8 or more in
358  // the first byte which is checked somewhere else
359  break;
360  }
361 
362  r = tld_domain_to_lowercase(buf);
363  if(r != NULL)
364  {
365  ++err_count;
366  fprintf(stderr, "error: tld_domain_to_lowercase(\"%s\") is expected to return NULL because of the invalid Unicode character. Got \"%s\" instead.\n", buf, r);
367  }
368  }
369 }
370 
371 
372 int main(int argc, char *argv[])
373 {
374  int i;
375  int seed = time(NULL);
376 
377  for(i = 1; i < argc; ++i)
378  {
379  if(strcmp(argv[i], "-v") == 0)
380  {
381  verbose = 1;
382  }
383  else if(strcmp(argv[i], "--seed") == 0)
384  {
385  if(i + 1 >= argc)
386  {
387  fprintf(stderr, "error: --seed expect a value.\n");
388  exit(1);
389  }
390  seed = atol(argv[i + 1]);
391  }
392  }
393 
394  printf("testing tld test domain lowercase version %s with seed %d\n", tld_version(), seed);
395 
396  srand(seed);
397 
398  test_empty();
399  test_all_characters();
400  test_invalid_xx();
401 
402  exit(err_count ? 1 : 0);
403 }
404 
405 /* vim: ts=4 sw=4 et
406  */
407 
int verbose
Whether the user asked for verbosity, false by default.
int main(int argc, char *argv[])
Console tool to generate the tld_data.c file.
Definition: tld_parser.cpp:907
LIBTLD_EXPORT char * tld_domain_to_lowercase(const char *domain)
Transform a domain with a TLD to lowercase before processing.
The public header of the libtld library.
LIBTLD_EXPORT const char * tld_version()
Return the version of the library.
Definition: tld.c:1043
int err_count
Number of errors so we know whether to exit with 0 or 1.
QTextStream out
The output text stream that writes inside the output file.
Definition: tld_parser.cpp:611

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.

Syndicate content

Snap! Websites
An Open Source CMS System in C++

Contact Us Directly