as2js: /home/snapwebsites/snapcpp/contrib/as2js/lib/string.cpp Source File

as2js  0.1.14
AlexScript to JavaScript
string.cpp
Go to the documentation of this file.
1 /* lib/string.cpp
2 
3 Copyright (c) 2005-2019 Made to Order Software Corp. All Rights Reserved
4 
6 
7 Permission is hereby granted, free of charge, to any
8 person obtaining a copy of this software and
9 associated documentation files (the "Software"), to
10 deal in the Software without restriction, including
11 without limitation the rights to use, copy, modify,
12 merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom
14 the Software is furnished to do so, subject to the
15 following conditions:
16 
17 The above copyright notice and this permission notice
18 shall be included in all copies or substantial
19 portions of the Software.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
22 ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
23 LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
24 FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
25 EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
26 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
27 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
28 ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 SOFTWARE.
31 
32 */
33 
34 #include "as2js/string.h"
35 #include "as2js/exceptions.h"
36 
37 #include <limits>
38 
39 
51 namespace as2js
52 {
53 
54 
55 
61  : basic_string()
62 {
63 }
64 
65 
89 String::String(char const *str, int len)
90  : basic_string()
91 {
92  from_char(str, len);
93 }
94 
95 
117 String::String(wchar_t const *str, int len)
118  : basic_string()
119 {
120  from_wchar(str, len);
121 }
122 
123 
144 String::String(as_char_t const *str, int len)
145  : basic_string()
146 {
148  {
149  throw exception_internal_error("String::String() called with an invalid input string");
150  }
151 }
152 
153 
166 String::String(std::string const& str)
167  : basic_string()
168 {
169  from_char(str.c_str(), static_cast<int>(str.length()));
170 }
171 
172 
185 String::String(std::wstring const& str)
186  : basic_string()
187 {
188  from_wchar(str.c_str(), static_cast<int>(str.length()));
189 }
190 
191 
200 String::String(std::basic_string<as_char_t> const& str)
201  : basic_string(str)
202 {
203 }
204 
205 
218 String& String::operator = (char const *str)
219 {
220  from_char(str);
221  return *this;
222 }
223 
224 
237 String& String::operator = (wchar_t const *str)
238 {
239  from_wchar(str);
240  return *this;
241 }
242 
243 
256 String& String::operator = (std::string const& str)
257 {
258  from_char(str.c_str(), static_cast<int>(str.length()));
259  return *this;
260 }
261 
262 
275 String& String::operator = (std::wstring const& str)
276 {
277  from_wchar(str.c_str(), static_cast<int>(str.length()));
278  return *this;
279 }
280 
281 
292 String& String::operator = (std::basic_string<as_char_t> const& str)
293 {
294  basic_string<as_char_t>::operator = (str);
295  return *this;
296 }
297 
298 
309 String& String::operator += (char const *str)
310 {
311  String s(str);
312  basic_string<as_char_t>::operator += (s);
313  return *this;
314 }
315 
316 
329 String& String::operator += (wchar_t const *str)
330 {
331  String s(str);
332  basic_string<as_char_t>::operator += (s);
333  return *this;
334 }
335 
336 
348 {
349  basic_string<as_char_t>::operator += (str);
350  return *this;
351 }
352 
353 
364 String& String::operator += (std::string const& str)
365 {
366  String s(str);
367  basic_string<as_char_t>::operator += (s);
368  return *this;
369 }
370 
371 
382 String& String::operator += (std::wstring const& str)
383 {
384  String s(str);
385  basic_string<as_char_t>::operator += (s);
386  return *this;
387 }
388 
389 
400 String& String::operator += (std::basic_string<as_char_t> const& str)
401 {
402  basic_string<as_char_t>::operator += (str);
403  return *this;
404 }
405 
406 
418 {
419  basic_string<as_char_t>::operator += (c);
420  return *this;
421 }
422 
423 
435 {
436  basic_string<as_char_t>::operator += (static_cast<as_char_t>(static_cast<unsigned char>(c)));
437  return *this;
438 }
439 
440 
457 String& String::operator += (wchar_t const c)
458 {
459  // TODO: cannot add surrogate in this way?
460  // (under MS-Windows, where wchar_t is 16 bits, this would be
461  // the only way to add large characters with wchar_t... we could
462  // save leads and when a tail arrives convert the character, but
463  // that's rather unsafe...)
464  if(valid_character(c))
465  {
466  basic_string<as_char_t>::operator += (static_cast<as_char_t>(c));
467  }
468  return *this;
469 }
470 
471 
472 // /** \brief Concatenate a String and a C-string.
473 // *
474 // * This function concatenates this String and a standard C-string.
475 // *
476 // * \note
477 // * This function creates a copy of the string. If you can, try to
478 // * use the += operator instead.
479 // *
480 // * \param[in] str The string to concatenate at the end of this String.
481 // *
482 // * \return A new string with the concatenated result.
483 // */
484 // String String::operator + (char const * str)
485 // {
486 // String result(*this);
487 // return result += str;
488 // }
489 //
490 //
491 // /** \brief Concatenate a String and a wide C-string.
492 // *
493 // * This function concatenates this String and a standard wide C-string.
494 // *
495 // * \note
496 // * This function creates a copy of the string. If you can, try to
497 // * use the += operator instead.
498 // *
499 // * \param[in] str The string to concatenate at the end of this String.
500 // *
501 // * \return A new string with the concatenated result.
502 // */
503 // String String::operator + (wchar_t const * str)
504 // {
505 // String result(*this);
506 // return result += str;
507 // }
508 //
509 //
510 // /** \brief Concatenate a String and a C-like string made of as_char_t characters.
511 // *
512 // * This function concatenates this String and a C-like string made of
513 // * as_char_t characters. The array must be null terminated (\0).
514 // *
515 // * \note
516 // * This function creates a copy of the string. If you can, try to
517 // * use the += operator instead.
518 // *
519 // * \param[in] str The string to concatenate at the end of this String.
520 // *
521 // * \return A new string with the concatenated result.
522 // */
523 // String String::operator + (as_char_t const * str)
524 // {
525 // String result(*this);
526 // return result += str;
527 // }
528 //
529 //
530 // /** \brief Concatenate a String and a C++ string.
531 // *
532 // * This function concatenates this String and a C++ string.
533 // *
534 // * \note
535 // * This function creates a copy of the string. If you can, try to
536 // * use the += operator instead.
537 // *
538 // * \param[in] str The string to concatenate at the end of this String.
539 // *
540 // * \return A new string with the concatenated result.
541 // */
542 // String String::operator + (std::string const & str)
543 // {
544 // String result(*this);
545 // return result += str;
546 // }
547 //
548 //
549 // /** \brief Concatenate a String and a C++ wide string.
550 // *
551 // * This function concatenates this String and a C++ wide string.
552 // *
553 // * \note
554 // * This function creates a copy of the string. If you can, try to
555 // * use the += operator instead.
556 // *
557 // * \param[in] str The string to concatenate at the end of this String.
558 // *
559 // * \return A new string with the concatenated result.
560 // */
561 // String String::operator + (std::wstring const & str)
562 // {
563 // String result(*this);
564 // return result += str;
565 // }
566 
567 
589 {
590  clear();
591  if(str != nullptr)
592  {
593  if(len == -1)
594  {
595  for(; *str != '\0'; ++str)
596  {
597  append(1, static_cast<unsigned char>(*str));
598  }
599  }
600  else
601  {
602  for(; len > 0 && *str != '\0'; --len, ++str)
603  {
604  append(1, static_cast<unsigned char>(*str));
605  }
606  }
607  }
608 
610 }
611 
612 
639 {
640  struct out
641  {
643  {
644  if(c >= 0xD800 && c < 0xDC00)
645  {
646  f_lead_surrogate = c;
647  return conversion_result_t::STRING_END; // not an error unless it was the last character
648  }
649  else if(c >= 0xDC00 && c <= 0xDFFF)
650  {
651  if(f_lead_surrogate == 0)
652  {
653  // invalid encoding
655  }
656  c = (((static_cast<as_char_t>(f_lead_surrogate) & 0x03FF) << 10) | (static_cast<as_char_t>(c) & 0x03FF)) + 0x10000;
657  // Note: UTF-16 characters cannot be invalid here
658  // (unless we add code points such as 0xFFFE and 0xFFFF
659  // among invalid characters)
660  if(!f_string.valid_character(c))
661  {
662  return conversion_result_t::STRING_INVALID; // LCOV_EXCL_LINE
663  }
664  f_lead_surrogate = 0;
665  }
666  f_string.append(1, c);
668  }
669 
670  String f_string = String();
671  as_char_t f_lead_surrogate = 0;
672  };
673 
674  out o;
676  if(str != nullptr)
677  {
678  if(len == -1)
679  {
680  for(; *str != '\0'; ++str)
681  {
682  result = o.add(*str);
684  {
685  break;
686  }
687  }
688  }
689  else
690  {
691  for(; len > 0 && *str != '\0'; --len, ++str)
692  {
693  result = o.add(*str);
695  {
696  break;
697  }
698  }
699  }
700  }
701 
703  {
704  *this = o.f_string;
705  }
706 
707  return result;
708 }
709 
710 
734 {
735  String s;
736  if(str != nullptr)
737  {
738  if(len == -1)
739  {
740  for(; *str != '\0'; ++str)
741  {
742  if(!valid_character(*str))
743  {
745  }
746  s.append(1, *str);
747  }
748  }
749  else
750  {
751  for(; len > 0 && *str != '\0'; --len, ++str)
752  {
753  if(!valid_character(*str))
754  {
756  }
757  s.append(1, *str);
758  }
759  }
760  }
761 
762  *this = s;
763 
765 }
766 
767 
793 {
794  String result;
795  unsigned char c;
796  as_char_t w;
797  int l;
798 
799  if(str != nullptr)
800  {
801  if(len == -1)
802  {
803  // it's a bit of a waste, but makes it a lot easier
804  len = std::char_traits<char>::length(str);
805  }
806 
807  while(len > 0)
808  {
809  --len;
810  c = static_cast<unsigned char>(*str++);
811 
812  if(c < 0x80)
813  {
814  w = c;
815  }
816  else
817  {
818  if(c >= 0xC0 && c <= 0xDF)
819  {
820  l = 1;
821  w = c & 0x1F;
822  }
823  else if(c >= 0xE0 && c <= 0xEF)
824  {
825  l = 2;
826  w = c & 0x0F;
827  }
828  else if(c >= 0xF0 && c <= 0xF7)
829  {
830  l = 3;
831  w = c & 0x07;
832  }
833  // The following are not valid UTF-8 characters, these are
834  // refused below as we verify the validity of the character
835  else if(c >= 0xF8 && c <= 0xFB)
836  {
837  l = 4;
838  w = c & 0x03;
839  }
840  else if(c >= 0xFC && c <= 0xFD)
841  {
842  l = 5;
843  w = c & 0x01;
844  }
845  else
846  {
847  // invalid UTF-8 sequence
849  }
850  if(len < l)
851  {
852  // not enough character
854  }
855  len -= l;
856  while(l > 0)
857  {
858  c = static_cast<unsigned char>(*str++);
859  if(c < 0x80 || c > 0xBF)
860  {
862  }
863  l--;
864  w = (w << 6) | (c & 0x3F);
865  }
866  }
867  if(!valid_character(w))
868  {
870  }
871  result.append(1, w);
872  }
873  }
874 
875  // it worked, we can smash this String
876  *this = result;
877 
879 }
880 
881 
892 {
893  return from_utf8(str.c_str(), str.length());
894 }
895 
896 
907 bool String::operator == (char const *str) const
908 {
909  String s(str);
910  return *this == s;
911 }
912 
913 
925 bool operator == (char const *str, String const& string)
926 {
927  String s(str);
928  return s == string;
929 }
930 
931 
942 bool String::operator != (char const *str) const
943 {
944  String s(str);
945  return *this != s;
946 }
947 
948 
960 bool operator != (char const *str, String const& string)
961 {
962  String s(str);
963  return s != string;
964 }
965 
966 
986 bool String::valid() const
987 {
988  for(as_char_t const *s(c_str()); *s != '\0'; ++s)
989  {
990  if(!valid_character(*s))
991  {
992  return false;
993  }
994  }
995 
996  return true;
997 }
998 
999 
1016 {
1017  // Note: as_char_t is an int32_t (i.e. a signed value)
1018  return (c < 0xD800 || c > 0xDFFF) // UTF-16 surrogates
1019  && c < 0x110000 // too large?
1020  && c >= 0; // too small?
1021 }
1022 
1023 
1037 bool String::is_int64() const
1038 {
1039  struct hex_test
1040  {
1041  static bool is_hex(as_char_t c)
1042  {
1043  return (c >= '0' && c <= '9')
1044  || (c >= 'a' && c <= 'f')
1045  || (c >= 'A' && c <= 'F');
1046  }
1047  };
1048 
1049  as_char_t const *s(c_str());
1050 
1051  // sign
1052  // TODO: in strict mode hexadecimal numbers cannot be signed
1053  if(*s == '-' || *s == '+')
1054  {
1055  ++s;
1056  }
1057 
1058  // handle special case of hexadecimal
1059  if(*s == '0')
1060  {
1061  ++s;
1062  if(*s == 'x' || *s == 'X')
1063  {
1064  if(s[1] == '\0')
1065  {
1066  // just "0x" or "0X" is not a valid number
1067  return false;
1068  }
1069  for(++s; hex_test::is_hex(*s); ++s);
1070  return *s == '\0';
1071  }
1072  // no octal support in strings
1073  }
1074 
1075  // number
1076  for(; *s >= '0' && *s <= '9'; ++s);
1077 
1078  return *s == '\0';
1079 }
1080 
1081 
1106 {
1107  as_char_t const *s(c_str());
1108 
1109  // sign
1110  if(*s == '-' || *s == '+')
1111  {
1112  ++s;
1113  }
1114 
1115  // integral part
1116  for(; *s >= '0' && *s <= '9'; ++s);
1117 
1118  // if '.' check for a decimal part
1119  if(*s == '.')
1120  {
1121  for(++s; *s >= '0' && *s <= '9'; ++s);
1122  }
1123 
1124  // if 'e' check for an exponent
1125  if(*s == 'e' || *s == 'E')
1126  {
1127  ++s;
1128  if(*s == '+' || *s == '-')
1129  {
1130  // skip the sign
1131  ++s;
1132  }
1133  for(; *s >= '0' && *s <= '9'; ++s);
1134  }
1135 
1136  return *s == '\0';
1137 }
1138 
1139 
1155 bool String::is_number() const
1156 {
1157  // floats support integers so this is true if this string is an int64
1158  return is_int64() || is_float64();
1159 }
1160 
1161 
1182 {
1183  if(empty())
1184  {
1185  return 0;
1186  }
1187 
1188  if(is_int64())
1189  {
1190  // Check whether it is an hexadecimal number, because if so
1191  // we use base 16. We want to force the base because we do
1192  // not support base 8 which std::stoll() could otherwise
1193  // switch to when we have a number that starts with zero.
1194  as_char_t const *s(c_str());
1195  if(*s == '+' || *s == '-')
1196  {
1197  ++s;
1198  }
1199  if(s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
1200  {
1201  // the strtoll() function supports the sign
1202  return std::stoll(to_utf8(), nullptr, 16);
1203  }
1204  return std::stoll(to_utf8(), nullptr, 10);
1205  }
1206 
1207  // this is invalid
1208  throw exception_internal_error("String::to_int64() called with an invalid integer");
1209 }
1210 
1211 
1231 {
1232  if(empty())
1233  {
1234  return 0.0;
1235  }
1236 
1237  if(is_float64())
1238  {
1239  return std::stod(to_utf8(), 0);
1240  }
1241 
1242  return std::numeric_limits<double>::quiet_NaN();
1243 }
1244 
1245 
1253 bool String::is_true() const
1254 {
1255  if(empty())
1256  {
1257  return false;
1258  }
1259 // Not too sure where I picked that up, but the documentation clearly says
1260 // that an empty string is false, anything else is true...
1261 // if(is_int64())
1262 // {
1263 // return to_int64() != 0;
1264 // }
1265 // if(is_float64())
1266 // {
1267 //#pragma GCC diagnostic push
1268 //#pragma GCC diagnostic ignored "-Wfloat-equal"
1269 // return strtod(to_utf8().c_str(), 0) != 0.0;
1270 //#pragma GCC diagnostic pop
1271 // }
1272  return true;
1273 }
1274 
1275 
1283 ssize_t String::utf8_length() const
1284 {
1285  ssize_t r(0);
1286  as_char_t c;
1287 
1288  for(as_char_t const *wc(c_str()); *wc != '\0'; ++wc)
1289  {
1290  // get one wide character
1291  c = *wc;
1292  if(!valid_character(c))
1293  {
1294  // character is not valid UTF-32
1295  return -1;
1296  }
1297 
1298  // simulate encoding
1299  if(c < 0x80)
1300  {
1301  r += 1;
1302  }
1303  else if(c < 0x800)
1304  {
1305  r += 2;
1306  }
1307  else if(c < 0x10000)
1308  {
1309  r += 3;
1310  }
1311  else //if(c < 0x200000)
1312  {
1313  r += 4;
1314  }
1315  }
1316 
1317  return r;
1318 }
1319 
1320 
1343 std::string String::to_utf8() const
1344 {
1345  std::string result;
1346  as_char_t c;
1347 
1348  // make sure we always have a null at the end...
1349  for(as_char_t const *wc = c_str(); *wc != '\0'; ++wc)
1350  {
1351  // get one wide character
1352  c = *wc;
1353  if(valid_character(c))
1354  {
1355  // only encode characters considered valid
1356  if(c < 0x80)
1357  {
1358  /* this will also encode '\0'... */
1359  result.append(1, c);
1360  }
1361  else if(c < 0x800)
1362  {
1363  result.append(1, (c >> 6) | 0xC0);
1364  result.append(1, (c & 0x3F) | 0x80);
1365  }
1366  else if(c < 0x10000)
1367  {
1368  result.append(1, (c >> 12) | 0xE0);
1369  result.append(1, ((c >> 6) & 0x3F) | 0x80);
1370  result.append(1, (c & 0x3F) | 0x80);
1371  }
1372  else
1373  {
1374  result.append(1, (c >> 18) | 0xF0);
1375  result.append(1, ((c >> 12) & 0x3F) | 0x80);
1376  result.append(1, ((c >> 6) & 0x3F) | 0x80);
1377  result.append(1, (c & 0x3F) | 0x80);
1378  }
1379  }
1380  }
1381 
1382  return result;
1383 }
1384 
1385 
1401 {
1402  String result;
1403 
1404  // TBD: should we limit the space check to spaces recognized by ECMAScript?
1405  as_char_t const *wc = c_str();
1406  while(*wc != '\0' && iswspace(*wc))
1407  {
1408  ++wc;
1409  }
1410 
1411  // accept a signed number
1412  if(*wc == '-' || *wc == '+')
1413  {
1414  result += *wc;
1415  ++wc;
1416  }
1417  if(*wc >= '0' && *wc <= '9')
1418  {
1419  // read the number, ignore the rest
1420  result += *wc;
1421  ++wc;
1422  while(*wc >= '0' && *wc <= '9')
1423  {
1424  result += *wc;
1425  ++wc;
1426  }
1427  if(*wc == '.')
1428  {
1429  result += *wc;
1430  ++wc;
1431  while(*wc >= '0' && *wc <= '9')
1432  {
1433  result += *wc;
1434  ++wc;
1435  }
1436  if(*wc == 'e' || *wc == 'E')
1437  {
1438  result += *wc;
1439  ++wc;
1440  if(*wc == '+' || *wc == '-')
1441  {
1442  result += *wc;
1443  ++wc;
1444  }
1445  while(*wc >= '0' && *wc <= '9')
1446  {
1447  result += *wc;
1448  ++wc;
1449  }
1450  }
1451  }
1452  // ignore anything else
1453  }
1454  else
1455  {
1456  // read the string, but simplify the spaces
1457  bool found_space(false);
1458  for(; *wc != '\0'; ++wc)
1459  {
1460  if(iswspace(*wc))
1461  {
1462  found_space = true;
1463  }
1464  else
1465  {
1466  if(found_space)
1467  {
1468  result += ' ';
1469  found_space = false;
1470  }
1471  result += *wc;
1472  }
1473  }
1474  }
1475 
1476  if(result.empty())
1477  {
1478  // make an empty string similar to zero
1479  result = "0";
1480  }
1481 
1482  return result;
1483 }
1484 
1485 
1496 std::ostream& operator << (std::ostream& out, String const& str)
1497 {
1498  // Note: under MS-Windows we'd need to use str.to_wchar() instead
1499  out << str.to_utf8();
1500  return out;
1501 }
1502 
1503 
1504 }
1505 // namespace as2js
1506 
1507 // vim: ts=4 sw=4 et
bool operator!=(char const *str) const
Compare this String against a char const * string.
Definition: string.cpp:942
int32_t as_char_t
Definition: string.h:47
bool valid() const
Check validity of the string.
Definition: string.cpp:986
String simplified() const
Make a simplified copy of this string.
Definition: string.cpp:1400
double float64_type
Definition: float64.h:48
ssize_t utf8_length() const
Calculate the length if converted to UTF-8.
Definition: string.cpp:1283
std::string to_utf8() const
Convert a string to UTF-8 and return the result.
Definition: string.cpp:1343
Float64::float64_type to_float64() const
Convert a string to a floating point number.
Definition: string.cpp:1230
String & operator+=(char const *str)
Append str to this String.
Definition: string.cpp:309
String()
Initialize an empty string.
Definition: string.cpp:60
bool is_int64() const
Check whether this string represents a valid integer.
Definition: string.cpp:1037
bool is_number() const
Check whether this string represents a number.
Definition: string.cpp:1155
conversion_result_t
Definition: string.h:60
String & operator=(char const *str)
Copy str in this String.
Definition: string.cpp:218
bool is_true() const
Check whether the string is considered true.
Definition: string.cpp:1253
conversion_result_t from_as_char(as_char_t const *str, int len=-1)
Copy an as_char_t string to this String.
Definition: string.cpp:733
Int64::int64_type to_int64() const
Convert a string to an integer number.
Definition: string.cpp:1181
conversion_result_t from_utf8(char const *str, int len=-1)
Copy a UTF-8 string to this String.
Definition: string.cpp:792
std::ostream & operator<<(std::ostream &out, Node const &node)
Send a node to the specified output stream.
bool operator==(char const *str) const
Compare this String against a char const * string.
Definition: string.cpp:907
conversion_result_t from_wchar(wchar_t const *str, int len=-1)
Copy a wchar_t string to this String.
Definition: string.cpp:638
The AlexScript to JavaScript namespace.
Definition: compiler.cpp:37
bool is_float64() const
Check whether the string represents a valid floating point number.
Definition: string.cpp:1105
conversion_result_t from_char(char const *str, int len=-1)
Copy a C-string to this String.
Definition: string.cpp:588
int64_t int64_type
Definition: int64.h:47
static bool valid_character(as_char_t c)
Check whether a character is considered valid.
Definition: string.cpp:1015

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.

Syndicate content

Snap! Websites
An Open Source CMS System in C++

Contact Us Directly