libtld: /home/snapwebsites/snapcpp/contrib/libtld/src/tld.c Source File

libtld  1.5.13
A library to determine the Top-Level Domain name of any URL.
tld.c
Go to the documentation of this file.
1 /* TLD library -- TLD, domain name, and sub-domain extraction
2  * Copyright (c) 2011-2019 Made to Order Software Corp. All Rights Reserved
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included
13  * in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22  */
23 
32 #include "libtld/tld.h"
33 #include "tld_data.h"
34 #if defined(MO_DARWIN)
35 # include <malloc/malloc.h>
36 #endif
37 #if !defined(MO_DARWIN) && !defined(MO_FREEBSD)
38 #include <malloc.h>
39 #endif
40 #include <stdlib.h>
41 #include <limits.h>
42 #include <string.h>
43 #include <ctype.h>
44 
45 #ifdef WIN32
46 #define strncasecmp _strnicmp
47 #endif
48 
/* Compare the NUL-terminated table entry `a` against the first `n`
 * characters of `b` (`b` does not need to be NUL-terminated at `n`).
 *
 * An entry that is exactly "*" is a wildcard and matches anything.
 *
 * Returns 0 on a match, -1 when `a` sorts before `b` (including the case
 * where `a` ends while `b` still has characters left) and 1 when `a`
 * sorts after `b` (including the case where the `n` characters of `b`
 * run out first).
 */
static int cmp(const char *a, const char *b, int n)
{
    int k;

    /* the wildcard entry always matches */
    if(a[0] == '*' && a[1] == '\0')
    {
        return 0;
    }

    /* walk both strings until a difference, the end of a, or n is reached */
    for(k = 0; k < n && a[k] != '\0'; ++k)
    {
        if(a[k] != b[k])
        {
            return a[k] < b[k] ? -1 : 1;
        }
    }

    if(a[k] != '\0')
    {
        /* the n characters of b were exhausted first, so a is larger */
        return 1;
    }

    /* a ended; b is larger only if some of its n characters remain */
    return k < n ? -1 : 0;
}
364 
365 
396 int search(int i, int j, const char *domain, int n)
397 {
398  int p, r;
399  const struct tld_description *tld;
400 
401  while(i < j)
402  {
403  p = (j - i) / 2 + i;
404  tld = tld_descriptions + p;
405  r = cmp(tld->f_tld, domain, n);
406  if(r < 0)
407  {
408  /* eliminate the first half */
409  i = p + 1;
410  }
411  else if(r > 0)
412  {
413  /* eliminate the second half */
414  j = p;
415  }
416  else
417  {
418  /* match */
419  return p;
420  }
421  }
422 
423  return -1;
424 }
425 
426 
/* Reset the caller supplied tld_info structure to its "nothing found"
 * defaults: the string pointers become null and the offset becomes -1.
 *
 * info -- structure to reset; it is dereferenced without a NULL check.
 *
 * NOTE(review): the numbering embedded in this listing jumps from 442 to
 * 445, so two statements of the original function are not shown here;
 * presumably they reset the remaining fields of the structure -- confirm
 * against the upstream source before editing this function.
 */
441 void tld_clear_info(struct tld_info *info)
442 {
445  info->f_country = (const char *) 0;
446  info->f_tld = (const char *) 0;
447  info->f_offset = -1;
448 }
449 
450 
/* Determine the TLD of `uri` and describe it in `info`.
 *
 * The function records a pointer to each '.' found in `uri` (keeping only
 * the last tld_max_level of them), then looks the labels up in the TLD
 * table starting from the right-most one. On success info->f_tld points at
 * the period that starts the TLD inside `uri` and info->f_offset is the
 * distance of that period from the start of `uri`.
 *
 * uri  -- NUL-terminated host name to analyze
 * info -- output structure; it is first reset with tld_clear_info()
 *
 * Returns TLD_RESULT_NULL when `uri` is null or empty, TLD_RESULT_BAD_URI
 * when two periods follow each other, TLD_RESULT_NO_TLD when there is no
 * period at all, TLD_RESULT_NOT_FOUND when the first lookup yields -1, and
 * otherwise the value computed in `result`.
 *
 * NOTE(review): several lines of the original function are absent from
 * this listing (see the gaps in the embedded numbering, marked below), so
 * the statements around those gaps are incomplete as shown here.
 *
 * NOTE(review): the pointer returned by malloc() is used without being
 * checked against NULL.
 */
555 enum tld_result tld(const char *uri, struct tld_info *info)
556 {
557  const char *end = uri;
558  const char **level_ptr;
559  int level = 0, start_level, i, r, p;
560  enum tld_result result;
561 
562  /* set defaults in the info structure */
563  tld_clear_info(info);
564 
565  if(uri == (const char *) 0 || uri[0] == '\0')
566  {
567  return TLD_RESULT_NULL;
568  }
569 
/* one slot per table level; each slot holds the address of a '.' in uri */
570  level_ptr = malloc(sizeof(const char *) * tld_max_level);
571 
572  while(*end != '\0')
573  {
574  if(*end == '.')
575  {
576  if(level >= tld_max_level)
577  {
578  /* At this point the maximum number of levels in the
579  * TLDs is 5
580  */
/* array is full: drop the oldest (left-most) period and append this one */
581  for(i = 1; i < tld_max_level; ++i)
582  {
583  level_ptr[i - 1] = level_ptr[i];
584  }
585  level_ptr[tld_max_level - 1] = end;
586  }
587  else
588  {
589  level_ptr[level] = end;
590  ++level;
591  }
/* the two most recently recorded periods are adjacent characters */
592  if(level >= 2 && level_ptr[level - 2] + 1 == level_ptr[level - 1])
593  {
594  /* two periods one after another */
595  free(level_ptr);
596  return TLD_RESULT_BAD_URI;
597  }
598  }
599  ++end;
600  }
601  /* if level is not at least 1 then there are no period */
602  if(level == 0)
603  {
604  /* no TLD */
605  free(level_ptr);
606  return TLD_RESULT_NO_TLD;
607  }
608 
/* remember how many periods were recorded; level becomes the index of the last one */
609  start_level = level;
610  --level;
/* NOTE(review): listing line 611 is missing; the next line is the tail of
 * a call whose result is tested through r below -- presumably a search()
 * of the top-level range for the label after the last period; confirm.
 */
612  level_ptr[level] + 1, (int) (end - level_ptr[level] - 1));
613  if(r == -1)
614  {
615  /* unknown */
616  free(level_ptr);
617  return TLD_RESULT_NOT_FOUND;
618  }
619 
620  /* check for the next level if there is one */
621  p = r;
622  while(level > 0 && tld_descriptions[r].f_start_offset != USHRT_MAX)
623  {
/* NOTE(review): listing lines 624-625 are missing; the two lines below are
 * the trailing arguments (the label between the previous period and the
 * current one, and its length) of a call whose result is tested through r.
 */
626  level_ptr[level - 1] + 1,
627  (int) (level_ptr[level] - level_ptr[level - 1] - 1));
628  if(r == -1)
629  {
630  /* we are done, return the previous level */
631  break;
632  }
633  p = r;
634  --level;
635  }
636 
637  /* if there are exceptions we may need to search those now if level is 0 */
638  if(level == 0)
639  {
/* NOTE(review): listing lines 640-641 are missing; the two lines below are
 * the trailing arguments (the text before the first recorded period, and
 * its length) of a call whose result is tested through r.
 */
642  uri,
643  (int) (level_ptr[0] - uri));
644  if(r != -1)
645  {
646  p = r;
647  }
648  }
649 
/* NOTE(review): listing lines 650 and 652-653 are missing; line 651 below
 * is only the beginning of the statement that computes `result`.
 */
651  result = info->f_status == TLD_STATUS_VALID
654 
655  /* did we hit an exception? */
/* NOTE(review): listing line 656 (the condition guarding this block) and
 * line 659 are missing.
 */
657  {
658  /* return the actual TLD and not the exception */
660  level = start_level - tld_descriptions[p].f_exception_level;
661  info->f_status = TLD_STATUS_VALID;
662  result = TLD_RESULT_SUCCESS;
663  }
664 
665  /* return a valid result */
/* NOTE(review): listing lines 666-667 are missing here. */
/* f_tld points at the period of the selected level inside the caller's string */
668  info->f_tld = level_ptr[level];
669  info->f_offset = (int) (level_ptr[level] - uri);
670 
671  free(level_ptr);
672 
673  return result;
674 }
675 
676 
/* Convert one hexadecimal digit character to its numeric value.
 *
 * The character is NOT validated: anything below 'A' is handled as a
 * decimal digit, anything from 'A' up to (but excluding) 'a' as an
 * uppercase letter and anything from 'a' up as a lowercase letter, so
 * callers are expected to pass only [0-9A-Fa-f].
 */
static int h2d(int c)
{
    if(c < 'A')
    {
        return c - '0';
    }
    return c - (c < 'a' ? 'A' : 'a') + 10;
}
698 
699 
741 enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
742 {
743  const char *p, *q, *username, *password, *host, *port, *n, *a, *query_string;
744  char domain[256];
745  int protocol_length, length, valid, c, i, j, anchor;
746  enum tld_result result;
747 
748  /* set defaults in the info structure */
749  tld_clear_info(info);
750 
751  if(uri == NULL || uri[0] == '\0')
752  {
753  return TLD_RESULT_NULL;
754  }
755 
756  /* check the protocol: [0-9A-Za-z_]+ */
757  for(p = uri; *uri != '\0' && *uri != ':'; ++uri)
758  {
759  if((*uri < 'a' || *uri > 'z')
760  && (*uri < 'A' || *uri > 'Z')
761  && (*uri < '0' || *uri > '9')
762  && *uri != '_')
763  {
764  return TLD_RESULT_BAD_URI;
765  }
766  }
767  valid = 0;
768  protocol_length = (int) (uri - p);
769  c = tolower(*p);
770  for(q = protocols; *q != '\0';)
771  {
772  if(q[0] == '*' && (q[1] == '\0' || q[1] == ','))
773  {
774  valid = 1;
775  break;
776  }
777  if(tolower(*q) == c)
778  {
779  if(strncasecmp(p, q, protocol_length) == 0
780  && (q[protocol_length] == '\0' || q[protocol_length] == ','))
781  {
782  valid = 1;
783  break;
784  }
785  }
786  /* move to the next protocol */
787  for(; *q != '\0' && *q != ','; ++q);
788  for(; *q == ','; ++q);
789  }
790  if(valid == 0)
791  {
792  return TLD_RESULT_BAD_URI;
793  }
794  if(uri[1] != '/' || uri[2] != '/')
795  {
796  return TLD_RESULT_BAD_URI;
797  }
798  uri += 3; /* skip the '://' */
799 
800  /* extract the complete domain name with sub-domains, etc. */
801  username = NULL;
802  host = uri;
803  for(; *uri != '/' && *uri != '\0'; ++uri)
804  {
805  if((unsigned char) *uri < ' ')
806  {
807  /* forbid control characters in domain name */
808  return TLD_RESULT_BAD_URI;
809  }
810  if(*uri == '@')
811  {
812  if(username != NULL)
813  {
814  /* two '@' signs is not possible */
815  return TLD_RESULT_BAD_URI;
816  }
817  username = host;
818  host = uri + 1;
819  }
820  else if(*uri & 0x80)
821  {
822  if(flags & VALID_URI_ASCII_ONLY)
823  {
824  /* only ASCII allowed by caller */
825  return TLD_RESULT_BAD_URI;
826  }
827  }
828  else if(*uri == ' ' || *uri == '+')
829  {
830  /* spaces not allowed in domain name */
831  return TLD_RESULT_BAD_URI;
832  }
833  else if(*uri == '%')
834  {
835  /* the next two digits must be hex
836  * note that the first digit must be at least 2 because
837  * we do not allow control characters
838  */
839  if(((uri[1] < '2' || uri[1] > '9')
840  && (uri[1] < 'a' || uri[1] > 'f')
841  && (uri[1] < 'A' || uri[1] > 'F'))
842  || ((uri[2] < '0' || uri[2] > '9')
843  && (uri[2] < 'a' || uri[2] > 'f')
844  && (uri[2] < 'A' || uri[2] > 'F')))
845  {
846  return TLD_RESULT_BAD_URI;
847  }
848  if(uri[1] == '2' && uri[2] == '0')
849  {
850  /* spaces not allowed in domain name */
851  return TLD_RESULT_BAD_URI;
852  }
853  if(uri[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
854  {
855  /* only ASCII allowed by caller */
856  return TLD_RESULT_BAD_URI;
857  }
858  /* skip the two digits right away */
859  uri += 2;
860  }
861  }
862  if(username != NULL)
863  {
864  password = username;
865  for(; *password != '@' && *password != ':'; ++password);
866  if(*password == ':')
867  {
868  if((host - 1) - (password + 1) <= 0)
869  {
870  /* empty password are not acceptable */
871  return TLD_RESULT_BAD_URI;
872  }
873  }
874  if(password - username - 1 <= 0)
875  {
876  /* username cannot be empty */
877  return TLD_RESULT_BAD_URI;
878  }
879  }
880  for(port = host; *port != ':' && port < uri; ++port);
881  if(*port == ':')
882  {
883  /* we have a port, it must be digits [0-9]+ */
884  for(n = port + 1; *n >= '0' && *n <= '9'; ++n);
885  if(n != uri || n == port + 1)
886  {
887  /* port is empty or includes invalid characters */
888  return TLD_RESULT_BAD_URI;
889  }
890  }
891 
892  /* check the address really quick */
893  query_string = NULL;
894  anchor = 0;
895  for(a = uri; *a != '\0'; ++a)
896  {
897  if((unsigned char) *a < ' ')
898  {
899  /* no control characters allowed */
900  return TLD_RESULT_BAD_URI;
901  }
902  else if(*a == '+' || *a == ' ') /* old space encoding */
903  {
904  if(flags & VALID_URI_NO_SPACES)
905  {
906  /* spaces not allowed by caller */
907  return TLD_RESULT_BAD_URI;
908  }
909  }
910  else if(*a == '?')
911  {
912  query_string = a + 1;
913  }
914  else if(*a == '&' && anchor == 0)
915  {
916  if(query_string == NULL)
917  {
918  /* & must be encoded if used before ? */
919  return TLD_RESULT_BAD_URI;
920  }
921  query_string = a + 1;
922  }
923  else if(*a == '=')
924  {
925  if(query_string != NULL && a - query_string == 0)
926  {
927  /* a query string variable name cannot be empty */
928  return TLD_RESULT_BAD_URI;
929  }
930  }
931  else if(*a == '#')
932  {
933  query_string = NULL;
934  anchor = 1;
935  }
936  else if(*a == '%')
937  {
938  /* the next two digits must be hex
939  * note that the first digit must be at least 2 because
940  * we do not allow control characters
941  */
942  if(((a[1] < '2' || a[1] > '9')
943  && (a[1] < 'a' || a[1] > 'f')
944  && (a[1] < 'A' || a[1] > 'F'))
945  || ((a[2] < '0' || a[2] > '9')
946  && (a[2] < 'a' || a[2] > 'f')
947  && (a[2] < 'A' || a[2] > 'F')))
948  {
949  return TLD_RESULT_BAD_URI;
950  }
951  if(a[1] == '2' && a[2] == '0' && (flags & VALID_URI_NO_SPACES))
952  {
953  /* spaces not allowed by caller */
954  return TLD_RESULT_BAD_URI;
955  }
956  if(a[1] >= '8' && (flags & VALID_URI_ASCII_ONLY))
957  {
958  /* only ASCII allowed by caller */
959  return TLD_RESULT_BAD_URI;
960  }
961  /* skip the two digits right away */
962  a += 2;
963  }
964  else if(*a & 0x80)
965  {
966  if(flags & VALID_URI_ASCII_ONLY)
967  {
968  /* only ASCII allowed by caller */
969  return TLD_RESULT_BAD_URI;
970  }
971  }
972  }
973 
974  /* check the domain */
975 
989  length = (int) (port - host);
990  if(length >= (int) (sizeof(domain) / sizeof(domain[0])))
991  {
992  /* sub-domains + domain + TLD is more than 255 characters?!
993  * note that the host main include many %XX characters but
994  * we ignore the fact here at this time; we could move this
995  * test in the for() loop below though.
996  */
997  return TLD_RESULT_BAD_URI;
998  }
999  if(length == 0)
1000  {
1001  /* although we could return TLD_RESULT_NULL it would not be
1002  * valid here because "http:///blah.com" is invalid, not NULL
1003  */
1004  return TLD_RESULT_BAD_URI;
1005  }
1006  for(i = 0, j = 0; i < length; ++i, ++j)
1007  {
1008  if(host[i] == '%') {
1009  domain[j] = (char) (h2d(host[i + 1]) * 16 + h2d(host[i + 2]));
1010  i += 2; /* skip the 2 digits */
1011  }
1012  else
1013  {
1014  domain[j] = host[i];
1015  }
1016  /* TODO: check that characters are acceptable in a domain name */
1017  }
1018  domain[j] = '\0';
1019  result = tld(domain, info);
1020  if(info->f_tld != NULL)
1021  {
1022  /* define the TLD inside the source string which "unfortunately"
1023  * is not null terminated by '\0'; also fix the offset since in
1024  * the complete URI the TLD is a bit further away
1025  */
1026  info->f_tld = host + info->f_offset;
1027  info->f_offset = (int) (info->f_tld - p);
1028  }
1029  return result;
1030 }
1031 
1032 
1043 const char *tld_version()
1044 {
1045  return LIBTLD_VERSION;
1046 }
1047 
1048 
1579 /* vim: ts=4 sw=4 et
1580  */
Special status to indicate an exception which is not directly a TLD.
Definition: tld.h:78
int search(int i, int j, const char *domain, int n)
Search for the specified domain.
Definition: tld.c:396
unsigned short f_exception_apply_to
This TLD is an exception of the "apply to" TLD.
Definition: tld_data.h:98
unsigned char f_category
One of the enum tld_category values.
Definition: tld_data.h:45
enum tld_status f_status
The status of the TLD.
Definition: tld.h:94
enum tld_result tld_check_uri(const char *uri, struct tld_info *info, const char *protocols, int flags)
Check that a URI is valid.
Definition: tld.c:741
const char * f_tld
Pointer to the TLD in the URI string you supplied.
Definition: tld.h:96
Declaration of the tld_description structure.
The input URI is empty.
Definition: tld.h:85
static int cmp(const char *a, const char *b, int n)
Compare two strings, one of which is limited by length.
Definition: tld.c:328
The public header of the libtld library.
enum tld_result tld(const char *uri, struct tld_info *info)
Get information about the TLD for the specified URI.
Definition: tld.c:555
const char * f_tld
The concerned TLD part without periods.
Definition: tld_data.h:121
static int h2d(int c)
Internal function used to transform XX values.
Definition: tld.c:686
int f_offset
The offset to the TLD in the URI string you supplied.
Definition: tld.h:97
const char * tld_version()
Return the version of the library.
Definition: tld.c:1043
unsigned short tld_end_offset
The end offset of the top-most top-level domain names.
Definition: tld_data.c:10272
The TLD was found, but it is marked as invalid.
Definition: tld.h:84
unsigned short f_start_offset
The offset to the start of the array of next TLD levels.
Definition: tld_data.h:70
Special status to indicate we did not find the TLD.
Definition: tld.h:73
unsigned short tld_start_offset
The start offset of the top-most top-level-domain names.
Definition: tld_data.c:10271
#define VALID_URI_NO_SPACES
Whether to check that the URI does not include any spaces.
const struct tld_description tld_descriptions[]
Array of all the TLDs defined in the tld_data.c file.
Definition: tld_data.c:537
unsigned char f_exception_level
The TLD level to be returned with this exception.
Definition: tld_data.h:108
The URI has a TLD that could not be determined.
Definition: tld.h:88
unsigned char f_status
One of the enum tld_status values.
Definition: tld_data.h:55
int tld_max_level
The maximum number of levels defined in the TLD table.
Definition: tld_data.c:10273
Description of a TLD.
Definition: tld_data.h:39
const char * f_country
The name of the country owning this extension.
Definition: tld_data.h:132
#define VALID_URI_ASCII_ONLY
Whether to check that the URI only includes ASCII.
The input URI has no TLD defined.
Definition: tld.h:86
The TLD was not found.
Definition: tld.h:62
void tld_clear_info(struct tld_info *info)
Clear the info structure.
Definition: tld.c:441
#define LIBTLD_VERSION
The version of the library as a string.
The TLD is currently valid.
Definition: tld.h:67
The URI includes characters that are not accepted by the function.
Definition: tld.h:87
tld_result
The result returned by tld().
Definition: tld.h:81
Success! The TLD of the specified URI is valid.
Definition: tld.h:83
const char * f_country
The country where this TLD is used.
Definition: tld.h:95
enum tld_category f_category
The category of the TLD.
Definition: tld.h:93
Set of information returned by the tld() function.
Definition: tld.h:91
unsigned short f_end_offset
The offset to the end of the array of next TLD levels.
Definition: tld_data.h:81

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.

Syndicate content

Snap! Websites
An Open Source CMS System in C++

Contact Us Directly