as2js: /home/snapwebsites/snapcpp/contrib/as2js/lib/stream.cpp Source File

as2js  0.1.14
AlexScript to JavaScript
stream.cpp
Go to the documentation of this file.
1 /* lib/stream.cpp
2 
3 Copyright (c) 2005-2019 Made to Order Software Corp. All Rights Reserved
4 
6 
7 Permission is hereby granted, free of charge, to any
8 person obtaining a copy of this software and
9 associated documentation files (the "Software"), to
10 deal in the Software without restriction, including
11 without limitation the rights to use, copy, modify,
12 merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom
14 the Software is furnished to do so, subject to the
15 following conditions:
16 
17 The above copyright notice and this permission notice
18 shall be included in all copies or substantial
19 portions of the Software.
20 
21 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
22 ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
23 LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
24 FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
25 EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
26 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
27 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
28 ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 SOFTWARE.
31 
32 */
33 
34 #include "as2js/stream.h"
35 
36 #include "as2js/exceptions.h"
37 #include "as2js/message.h"
38 
39 #include <unistd.h>
40 
41 
42 namespace as2js
43 {
44 
45 /**********************************************************************/
46 /**********************************************************************/
47 /*** FILTERS ********************************************************/
48 /**********************************************************************/
49 /**********************************************************************/
50 
51 
58 {
    // Body of the DecodingFilter destructor (the member index at the end of
    // this page places "virtual ~DecodingFilter()" at stream.cpp:57).
    // The body is empty.
59 }
60 
61 
// Push one byte in the decoder.
//
// The byte is only appended at the end of f_buffer; no decoding happens
// in this function and nothing is returned.
71 void DecodingFilter::putc(unsigned char c)
72 {
73  f_buffer.push_back(c);
74 }
75 
76 
97 {
    // Body of getc() (member index: "as_char_t getc()", stream.cpp:96).
    // An empty buffer is reported as Input::INPUT_EOF without calling
    // get_char(); otherwise whatever get_char() returns is passed through,
    // so get_char() is reached from here with at least one buffered byte.
98  if(f_buffer.empty())
99  {
100  return Input::INPUT_EOF;
101  }
102 
103  return get_char();
104 }
105 
106 
119 {
    // Body of the ISO-8859-1 get_char() (member index: stream.cpp:118).
    // The first buffered byte becomes the character value unchanged and is
    // then removed from f_buffer.
    // f_buffer[0] is read without a size check here; getc() tests for an
    // empty buffer before it calls get_char().
120  // no conversion for ISO-8859-1 to UTF-32
121  Input::char_t c(f_buffer[0]);
122  f_buffer.erase(f_buffer.begin());
123  return c;
124 }
125 
126 
141 {
    // Body of the UTF-8 get_char() (member index: stream.cpp:140).
    // Decodes one character from the front of f_buffer. Returns the
    // character, Input::INPUT_NAC when the sequence is not complete yet
    // (nothing is consumed), or Input::INPUT_ERR on an invalid sequence.
142  // Note: we know that the buffer is at least 1 byte
143  unsigned char b(f_buffer[0]);
144 
    // single byte (ASCII) character: consume it and return it directly
145  if(b < 0x80)
146  {
147  f_buffer.erase(f_buffer.begin());
148  return b;
149  }
150 
    // the lead byte gives the sequence length (l) and the first payload bits (c)
    // NOTE(review): the range tests accept 0xC0/0xC1 lead bytes and nothing
    // in the visible code rejects overlong encodings -- confirm whether the
    // final validity check (condition missing from this listing) covers it.
151  size_t l(0);
152  as_char_t c(0);
153  if(b >= 0xC0 && b <= 0xDF)
154  {
155  l = 2;
156  c = b & 0x1F;
157  }
158  else if(b >= 0xE0 && b <= 0xEF)
159  {
160  l = 3;
161  c = b & 0x0F;
162  }
163  else if(b >= 0xF0 && b <= 0xF7)
164  {
165  l = 4;
166  c = b & 0x07;
167  }
168  else
169  {
    // reached for a continuation byte (0x80-0xBF) in lead position or for 0xF8-0xFF
170  // invalid UTF-8 sequence, erase one input byte
171  f_buffer.erase(f_buffer.begin());
172  return Input::INPUT_ERR;
173  }
174  if(f_buffer.size() < l)
175  {
176  // not enough bytes for this character
177  return Input::INPUT_NAC;
178  }
    // every following byte must be a continuation byte (0x80-0xBF); each one adds 6 bits
179  for(size_t i(1); i < l; ++i)
180  {
181  b = f_buffer[i];
182  if(b < 0x80 || b > 0xBF)
183  {
    // the offending byte itself (index i) stays in the buffer
184  // found an invalid byte, remove bytes before that
185  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + i);
186  return Input::INPUT_ERR;
187  }
188  c = (c << 6) | (b & 0x3F);
189  }
190 
191  // get rid of those bytes
192  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + l);
193 
194  // is it a UTF-16 surrogate or too large a character?
    // NOTE(review): listing line 195 (the condition guarding the block below)
    // is missing from this page; the member index lists
    // String::valid_character(), presumably what is tested -- confirm against
    // the real file. The bytes were already removed above when this returns.
196  {
197  return Input::INPUT_ERR;
198  }
199 
200  // return that character
201  return c;
202 }
203 
204 
226 {
    // Body of next_char() (member index: "as_char_t next_char(as_char_t c)",
    // "Decode a UTF-16 character", stream.cpp:225).
    // c is one 16 bit code unit. Returns the decoded character,
    // Input::INPUT_NAC after storing a lead surrogate, or Input::INPUT_ERR
    // for an unpaired surrogate.
    // NOTE(review): a lead surrogate received while f_lead_surrogate is still
    // set replaces the pending one without reporting an error.
227  if(c >= 0xD800 && c < 0xDC00)
228  {
229  f_lead_surrogate = c;
230  return Input::INPUT_NAC; // not an error unless it was the last 2 bytes
231  }
232  else if(c >= 0xDC00 && c <= 0xDFFF)
233  {
234  if(f_lead_surrogate == 0)
235  {
    // the two bytes are erased here because the get_char() callers return
    // on INPUT_ERR before reaching their own erase
236  // lead surrogate missing, skip trail
237  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
238  return Input::INPUT_ERR;
239  }
    // combine the 10 payload bits of each surrogate and add the 0x10000 offset
240  c = (((static_cast<as_char_t>(f_lead_surrogate) & 0x03FF) << 10) | (static_cast<as_char_t>(c) & 0x03FF)) + 0x10000;
241  f_lead_surrogate = 0;
242  }
243  else if(f_lead_surrogate != 0)
244  {
    // nothing is erased on this path, so the current code unit is still in
    // f_buffer after the error is returned
245  // trail surrogate missing
246  f_lead_surrogate = 0;
247  return Input::INPUT_ERR;
248  }
249 
250  return c;
251 }
252 
253 
266 {
    // Body of the UTF-16 Little Endian get_char() (member index: stream.cpp:265).
    // Each pass needs 2 buffered bytes; they are combined with byte 0 as the
    // low byte and handed to next_char(). The loop repeats while next_char()
    // answers INPUT_NAC, so the code unit that follows is read as well.
    // Returns INPUT_NAC when fewer than 2 bytes are available.
267  Input::char_t c;
268  do
269  {
270  if(f_buffer.size() < 2)
271  {
272  return Input::INPUT_NAC;
273  }
274 
275  c = next_char(f_buffer[0] + f_buffer[1] * 256);
276  if(c == Input::INPUT_ERR)
277  {
    // returns before the erase below; next_char() decides what gets removed on error
278  return Input::INPUT_ERR;
279  }
280  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
281  }
282  while(c == Input::INPUT_NAC);
283 
284  return c;
285 }
286 
287 
300 {
    // Body of the UTF-16 Big Endian get_char() (member index: stream.cpp:299).
    // Each pass needs 2 buffered bytes; they are combined with byte 0 as the
    // high byte and handed to next_char(). The loop repeats while next_char()
    // answers INPUT_NAC, so the code unit that follows is read as well.
    // Returns INPUT_NAC when fewer than 2 bytes are available.
301  Input::char_t c;
302  do
303  {
304  if(f_buffer.size() < 2)
305  {
306  return Input::INPUT_NAC;
307  }
308 
309  c = next_char(f_buffer[0] * 256 + f_buffer[1]);
310  if(c == Input::INPUT_ERR)
311  {
    // returns before the erase below; next_char() decides what gets removed on error
312  return Input::INPUT_ERR;
313  }
314  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
315  }
316  while(c == Input::INPUT_NAC);
317 
318  return c;
319 }
320 
321 
335 {
    // Body of the UTF-32 Little Endian get_char() (member index: stream.cpp:334).
    // Needs 4 buffered bytes, otherwise returns INPUT_NAC without consuming
    // anything. The 4 bytes are always removed once the value is assembled.
336  if(f_buffer.size() < 4)
337  {
338  return Input::INPUT_NAC;
339  }
340 
    // NOTE(review): assuming the buffer elements are unsigned bytes, they are
    // promoted to int before shifting, so "<< 24" of a byte >= 0x80 does not
    // fit in a signed int -- consider shifting a uint32_t instead; confirm
    // the element type of byte_vector_t.
341  // little endian has byte 0 as the least significant
342  Input::char_t c(
343  (f_buffer[0] << 0)
344  | (f_buffer[1] << 8)
345  | (f_buffer[2] << 16)
346  | (f_buffer[3] << 24)
347  );
348  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 4);
    // NOTE(review): listing line 349 (the condition guarding the block below)
    // is missing from this page; presumably a String::valid_character() test
    // -- confirm against the real file.
350  {
351  return Input::INPUT_ERR;
352  }
353  return c;
354 }
355 
356 
370 {
    // Body of the UTF-32 Big Endian get_char() (member index: stream.cpp:369).
    // Needs 4 buffered bytes, otherwise returns INPUT_NAC without consuming
    // anything. The 4 bytes are always removed once the value is assembled.
371  if(f_buffer.size() < 4)
372  {
373  return Input::INPUT_NAC;
374  }
375 
    // NOTE(review): assuming the buffer elements are unsigned bytes, they are
    // promoted to int before shifting, so "<< 24" of a byte >= 0x80 does not
    // fit in a signed int -- consider shifting a uint32_t instead; confirm
    // the element type of byte_vector_t.
376  // big endian has byte 0 as the most significant
377  Input::char_t c(
378  (f_buffer[0] << 24)
379  | (f_buffer[1] << 16)
380  | (f_buffer[2] << 8)
381  | (f_buffer[3] << 0)
382  );
383  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 4);
    // NOTE(review): listing line 384 (the condition guarding the block below)
    // is missing from this page; presumably a String::valid_character() test
    // -- confirm against the real file.
385  {
386  return Input::INPUT_ERR;
387  }
388  return c;
389 }
390 
391 
413 {
    // Body of the detecting get_char() (member index: "Read the next
    // character in any format", stream.cpp:412).
    // On the first call with at least 4 buffered bytes a concrete decoding
    // filter is selected and stored in f_filter; afterwards the buffered
    // bytes are forwarded to that filter one at a time.
414  if(!f_filter)
415  {
    // NOTE(review): while fewer than 4 bytes are buffered no filter is
    // selected and INPUT_NAC is returned, which also applies to an input
    // that is shorter than 4 bytes in total -- confirm this is intended.
416  if(f_buffer.size() < 4)
417  {
418  return Input::INPUT_NAC;
419  }
420 
421  // read the BOM in big endian
422  uint32_t bom(
423  (f_buffer[0] << 24)
424  | (f_buffer[1] << 16)
425  | (f_buffer[2] << 8)
426  | (f_buffer[3] << 0)
427  );
428 
    // the 4 byte (UTF-32) marks are tested before the 2 byte (UTF-16) ones
    // because FF FE 00 00 starts with the UTF-16 Little Endian mark FF FE;
    // in each BOM case only the BOM bytes are removed from the buffer
429  if(bom == 0x0000FEFF)
430  {
431  // UTF-32 Big Endian
432  f_filter.reset(new DecodingFilterUTF32BE);
433  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 4);
434  }
435  else if(bom == 0xFFFE0000)
436  {
437  // UTF-32 Little Endian
438  f_filter.reset(new DecodingFilterUTF32LE);
439  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 4);
440  }
441  else if((bom >> 16) == 0xFEFF)
442  {
443  // UTF-16 Big Endian
444  f_filter.reset(new DecodingFilterUTF16BE);
445  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
446  }
447  else if((bom >> 16) == 0xFFFE)
448  {
449  // UTF-16 Little Endian
450  f_filter.reset(new DecodingFilterUTF16LE);
451  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
452  }
453  else if((bom & 0xFFFFFF00) == 0xEFBBBF00)
454  {
455  // UTF-8
456  f_filter.reset(new DecodingFilterUTF8);
457  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 3);
458  }
459  else
460  {
    // no BOM: nothing is removed from the buffer in this branch
    // NOTE(review): the conversion runs on whatever is buffered right now;
    // if the buffer ends in the middle of a multi-byte sequence the result
    // depends on how from_utf8() treats truncated input -- confirm.
461  // if each character is valid UTF-8, then use UTF-8
462  String s;
463  String::conversion_result_t r(s.from_utf8(reinterpret_cast<char const *>(&f_buffer[0]), f_buffer.size()));
    // NOTE(review): listing line 464 (the test on r) is missing from this page
465  {
466  f_filter.reset(new DecodingFilterUTF8);
467  }
468  else
469  {
470  // fallback to ISO-8859-1 (should very rarely happen!)
471  f_filter.reset(new DecodingFilterISO88591);
472  }
473  }
474  }
475 
476  // we do not get BOMs returned, yet we could check for the BOM
477  // character and adjust the filter if we detect it being
478  // swapped (it does not look like Unicode promotes that scheme
479  // anymore though, therefore at this point we won't do that...)
480 
    // first ask the selected filter for a character from what it already
    // holds, then feed it one byte per iteration until it returns something
    // other than INPUT_EOF/INPUT_NAC or this filter's buffer is empty
481  Input::char_t c(f_filter->getc());
482  while((c == Input::INPUT_EOF || c == Input::INPUT_NAC)
483  && !f_buffer.empty())
484  {
485  // transmit the data added to "this" filter
486  // down to f_filter, but only as required
487  // because otherwise we'd generate an EOF
488  f_filter->putc(f_buffer[0]);
489  f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 1);
490  c = f_filter->getc();
491  }
492 
493  return c;
494 }
495 
496 
497 
498 /**********************************************************************/
499 /**********************************************************************/
500 /*** INPUT **********************************************************/
501 /**********************************************************************/
502 /**********************************************************************/
503 
504 
518  : f_filter(filter)
519  //, f_position() -- auto-init
520  //, f_unget() -- auto-init
521 {
    // Input constructor (member index: "Input(DecodingFilter::pointer_t
    // filter=DecodingFilter::pointer_t(new DecodingFilterDetect))",
    // stream.cpp:517). It only stores the filter pointer; the body is empty.
522 }
523 
524 
533 {
    // Body of get_position() (member index: "Position & get_position()",
    // stream.cpp:532): gives access to the f_position member.
534  return f_position;
535 }
536 
537 
546 {
    // Returns the f_position member.
    // NOTE(review): the member index of this page has no entry for
    // stream.cpp:545; presumably the const overload of get_position() --
    // confirm against the real file.
547  return f_position;
548 }
549 
550 
561 {
    // Body of getc() (member index: "char_t getc()", "Get one character",
    // stream.cpp:560).
    // Characters pushed back with ungetc() are returned first, the most
    // recently pushed one first; only when none is pending is
    // filter_getc() called.
562  if(!f_unget.empty())
563  {
564  char_t result(f_unget.back());
565  f_unget.pop_back();
566  return result;
567  }
568  return filter_getc();
569 }
570 
571 
581 {
    // Body of ungetc() (member index: "void ungetc(char_t c)", "Unget one
    // character", stream.cpp:580).
    // Values outside of 1 .. 0x10FFFF are dropped without any error.
    // NOTE(review): the "c > 0" test also drops U+0000, so a NUL character
    // cannot be pushed back -- confirm this is intended.
582  // silently avoid ungetting special values such as INPUT_EOF
583  // (TBD: maybe we should check surrogates?)
584  if(c > 0 && c < 0x110000)
585  {
586  f_unget.push_back(c);
587  }
588 }
589 
590 
610 {
    // Body of filter_getc() (member index: "virtual char_t filter_getc()",
    // "Get the next character", stream.cpp:609).
    // Bytes from get_byte() are pushed into f_filter one at a time until the
    // filter returns something other than INPUT_NAC or INPUT_EOF.
611  // if the input class used does not overload this function,
612  // then we get the next byte and try to convert it to a
613  // character, if that works, return that character
614 
615  char_t w;
616  do
617  {
618  char_t c(get_byte());
619  if(c == Input::INPUT_EOF)
620  {
    // no more bytes: a sequence still incomplete at this point (INPUT_NAC)
    // is reported as INPUT_ERR, anything else is returned unchanged
621  // determine the final result
622  w = f_filter->getc();
623  return w == Input::INPUT_NAC ? Input::INPUT_ERR : w;
624  }
625  f_filter->putc(c);
626  w = f_filter->getc();
627  }
628  while(w == Input::INPUT_NAC || w == Input::INPUT_EOF);
629  // EOF can happen if we bump in a BOM in the middle of nowhere
630  // so we have to loop on EOF as well
631 
632  return w;
633 }
634 
635 
653 {
    // Body of the base get_byte() (member index: "virtual char_t get_byte()",
    // stream.cpp:652). It never returns a byte: it always throws
    // exception_internal_error.
654  // this function should never be called
655  throw exception_internal_error("internal error: the get_byte() of the Input class was called");
656 }
657 
658 
659 /**********************************************************************/
660 /**********************************************************************/
661 /*** STANDARD INPUT *************************************************/
662 /**********************************************************************/
663 /**********************************************************************/
664 
665 
678 {
    // Body of the StandardInput constructor (member index: stream.cpp:677).
    // The filename of the position object is set to "-".
679  get_position().set_filename("-");
680 }
681 
682 
694 {
    // Body of get_byte() reading std::cin (member index: stream.cpp:693).
    // Returns the byte read as a value from 0 to 255, or INPUT_EOF when
    // std::cin.get() fails.
695  char c;
696  if(std::cin.get(c))
697  {
    // the "& 255" keeps the result in 0..255 even where char is signed
698  return static_cast<char_t>(c) & 255;
699  }
700  return INPUT_EOF;
701 }
702 
703 
704 /**********************************************************************/
705 /**********************************************************************/
706 /*** FILE INPUT *****************************************************/
707 /**********************************************************************/
708 /**********************************************************************/
709 
710 
// Use the named file as the input stream.
//
// filename -- name of the file to open; it is converted to UTF-8 before
//             being passed to f_file.open().
//
// Returns true when the file was opened (the position's filename is then
// set to filename), false when f_file is not open after the attempt.
//
// Throws exception_file_already_open when f_file is already open; the
// object cannot be reused for another file.
//
// NOTE(review): no open mode is passed to f_file.open(), so the stream's
// default mode applies; the type of f_file is not visible here -- confirm
// whether a binary mode is wanted for the byte-level decoders.
727 bool FileInput::open(String const& filename)
728 {
729  if(f_file.is_open())
730  {
731  throw exception_file_already_open("file object for \"" + get_position().get_filename().to_utf8() + "\" cannot be reused for \"" + filename.to_utf8() + "\"");
732  }
733 
734  std::string utf8(filename.to_utf8());
735  f_file.open(utf8.c_str());
736  if(!f_file.is_open())
737  {
738  return false;
739  }
740  get_position().set_filename(filename);
741 
742  return true;
743 }
744 
745 
753 {
    // Body of get_byte() reading f_file (member index: "Get the next byte
    // from the file", stream.cpp:752).
    // Returns the byte read as a value from 0 to 255, or INPUT_EOF when
    // f_file.get() fails.
754  char c;
755  if(f_file.get(c))
756  {
    // the "& 255" keeps the result in 0..255 even where char is signed
757  return static_cast<char_t>(c) & 255;
758  }
759  return INPUT_EOF;
760 }
761 
762 
763 
764 
765 /**********************************************************************/
766 /**********************************************************************/
767 /*** STRING INPUT ***************************************************/
768 /**********************************************************************/
769 /**********************************************************************/
770 
781  : f_str(str)
782  //, f_pos(0) -- auto-init
783 {
    // StringInput constructor (member index: "StringInput(String const &str,
    // Position::counter_t line=1)", stream.cpp:780); str is copied in f_str.
784  // in case line is not set to 1
    // NOTE(review): listing line 785 (the statement this comment refers to)
    // is missing from this page; the member index lists
    // Position::reset_counters(), presumably what is called with line --
    // confirm against the real file.
786 }
787 
788 
797 {
    // Body of filter_getc() for string input (member index: stream.cpp:796).
    // Returns the character of f_str at f_pos and advances f_pos, or
    // INPUT_EOF once f_pos has reached the end of the string. The characters
    // are taken from f_str directly; f_filter is not used here.
798  char_t c(INPUT_EOF);
799 
800  if(f_pos < f_str.length())
801  {
802  c = f_str[f_pos];
803  ++f_pos;
804  }
805  return c;
806 }
807 
808 
809 
810 
811 /**********************************************************************/
812 /**********************************************************************/
813 /*** OUTPUT *********************************************************/
814 /**********************************************************************/
815 /**********************************************************************/
816 
817 
826 {
    // Body of get_position() in the OUTPUT section (member index:
    // "Position & get_position()", stream.cpp:825): gives access to the
    // f_position member.
827  return f_position;
828 }
829 
830 
839 {
    // Returns the f_position member.
    // NOTE(review): the member index of this page has no entry for
    // stream.cpp:838; presumably the const overload of get_position() --
    // confirm against the real file.
840  return f_position;
841 }
842 
843 
// Write data to this output stream.
//
// data -- the string to write; it is passed unchanged to internal_write(),
//         which does the actual output.
857 void Output::write(String const& data)
858 {
859  internal_write(data);
860 }
861 
862 
863 /**********************************************************************/
864 /**********************************************************************/
865 /*** STANDARD OUTPUT ************************************************/
866 /**********************************************************************/
867 /**********************************************************************/
868 
875 {
    // Body of the StandardOutput constructor (member index: stream.cpp:874).
    // The filename of the position object is set to "-".
876  get_position().set_filename("-");
877 }
878 
879 
890 {
    // Body of internal_write() for standard output (member index: "Write a
    // string to standard output", stream.cpp:889).
    // data is converted to UTF-8 and sent to std::cout; when the stream is in
    // a failed state afterwards, a message is emitted and exception_exit is
    // thrown with exit code 1.
891  std::cout << data.to_utf8();
892  if(!std::cout)
893  {
894  // should we do something here?
    // NOTE(review): listing line 895, presumably the declaration of msg, is
    // missing from this page -- confirm against the real file.
896  msg << "I/O error: could not write to output.";
897  throw exception_exit(1, "I/O error: could not write to output.");
898  }
899 }
900 
901 
902 /**********************************************************************/
903 /**********************************************************************/
904 /*** OUTPUT FILE ****************************************************/
905 /**********************************************************************/
906 /**********************************************************************/
907 
908 
// Open the output file.
//
// filename -- name of the file to open; it is converted to UTF-8 before
//             being passed to f_file.open().
//
// Returns true when the file was opened (the position's filename is then
// set to filename), false when f_file is not open after the attempt.
//
// Throws exception_file_already_open when f_file is already open; the
// object cannot be reused for another file.
925 bool FileOutput::open(String const& filename)
926 {
927  if(f_file.is_open())
928  {
929  throw exception_file_already_open("file object for \"" + get_position().get_filename().to_utf8() + "\" cannot be reused for \"" + filename.to_utf8() + "\"");
930  }
931 
932  std::string utf8(filename.to_utf8());
933  f_file.open(utf8.c_str());
934  if(!f_file.is_open())
935  {
936  return false;
937  }
938  get_position().set_filename(filename);
939 
940  return true;
941 }
942 
943 
954 {
    // Body of internal_write() for file output (member index: "Write to the
    // output file", stream.cpp:953).
    // data is converted to UTF-8 and sent to f_file; when the stream is in a
    // failed state afterwards, a message is emitted and exception_exit is
    // thrown with exit code 1.
955  f_file << data.to_utf8();
956  if(!f_file)
957  {
958  // should we do something here?
    // NOTE(review): listing line 959, presumably the declaration of msg, is
    // missing from this page -- confirm against the real file.
960  msg << "I/O error: could not write to output.";
961  throw exception_exit(1, "I/O error: could not write to output.");
962  }
963 }
964 
965 
966 /**********************************************************************/
967 /**********************************************************************/
968 /*** OUTPUT STRING **************************************************/
969 /**********************************************************************/
970 /**********************************************************************/
971 
972 
981 {
    // Body of get_string() (member index: "String const & get_string()
    // const", stream.cpp:980): returns the f_string member, which
    // internal_write() appends to.
982  return f_string;
983 }
984 
985 
993 {
    // Body of internal_write() for string output (member index: "Write to
    // the output string", stream.cpp:992): data is appended to f_string.
994  f_string += data;
995 }
996 
997 
998 }
999 // namespace as2js
1000 
1001 // vim: ts=4 sw=4 et
virtual char_t get_byte()
Read one byte from the standard input.
Definition: stream.cpp:693
virtual void internal_write(String const &data)
Write to the output file.
Definition: stream.cpp:953
bool open(String const &filename)
Open the output file.
Definition: stream.cpp:925
int32_t as_char_t
Definition: string.h:47
virtual char_t filter_getc()
Get the next character.
Definition: stream.cpp:796
std::shared_ptr< DecodingFilter > pointer_t
Definition: stream.h:51
std::string to_utf8() const
Convert a string to UTF-8 and return the result.
Definition: string.cpp:1343
void ungetc(char_t c)
Unget one character.
Definition: stream.cpp:580
virtual char_t filter_getc()
Get the next character.
Definition: stream.cpp:609
Input(DecodingFilter::pointer_t filter=DecodingFilter::pointer_t(new DecodingFilterDetect))
Initialize an input object.
Definition: stream.cpp:517
as_char_t next_char(as_char_t c)
Decode a UTF-16 character.
Definition: stream.cpp:225
static char_t const INPUT_ERR
Definition: stream.h:157
virtual char_t get_byte()
Get the next byte from the file.
Definition: stream.cpp:752
virtual as_char_t get_char()
Decode UTF-16 in Little Endian format.
Definition: stream.cpp:265
virtual as_char_t get_char()
Get the next ISO-8859-1 character.
Definition: stream.cpp:118
StandardOutput()
Initializes the standard output object.
Definition: stream.cpp:874
std::vector< char_t > f_unget
Definition: stream.h:175
StandardInput()
Use standard input as the input stream.
Definition: stream.cpp:677
virtual void internal_write(String const &data)
Write a string to standard output.
Definition: stream.cpp:889
conversion_result_t
Definition: string.h:60
void write(String const &data)
Write data to this output stream.
Definition: stream.cpp:857
void reset_counters(counter_t line=DEFAULT_COUNTER)
Reset the counter.
Definition: position.cpp:84
int32_t counter_t
Definition: position.h:47
as_char_t char_t
Definition: stream.h:153
virtual as_char_t get_char()
Get the next UTF-8 character.
Definition: stream.cpp:140
as_char_t getc()
Retrieve the next character.
Definition: stream.cpp:96
bool open(String const &filename)
Use the named file as the input stream.
Definition: stream.cpp:727
virtual ~DecodingFilter()
Destructor.
Definition: stream.cpp:57
conversion_result_t from_utf8(char const *str, int len=-1)
Copy a UTF-8 string to this String.
Definition: string.cpp:792
virtual as_char_t get_char()=0
virtual char_t get_byte()
Function used to get the following byte of data.
Definition: stream.cpp:652
static char_t const INPUT_EOF
Definition: stream.h:155
virtual as_char_t get_char()
Read the next character in any format.
Definition: stream.cpp:412
Position & get_position()
Get the position object of the input object.
Definition: stream.cpp:532
byte_vector_t f_buffer
Definition: stream.h:63
Position & get_position()
Get the position object of the output object.
Definition: stream.cpp:825
virtual as_char_t get_char()
Decode UTF-32 in Little Endian format.
Definition: stream.cpp:334
String::size_type f_pos
Definition: stream.h:229
DecodingFilter::pointer_t f_filter
Definition: stream.h:173
The AlexScript to JavaScript namespace.
Definition: compiler.cpp:37
virtual as_char_t get_char()
Decode UTF-32 in Big Endian format.
Definition: stream.cpp:369
String const & get_string() const
Retrieve a read-only reference to the output string.
Definition: stream.cpp:980
char_t getc()
Get one character.
Definition: stream.cpp:560
virtual as_char_t get_char()
Decode UTF-16 in Big Endian format.
Definition: stream.cpp:299
void set_filename(String const &filename)
Set the filename being read.
Definition: position.cpp:52
Position f_position
Definition: stream.h:174
static char_t const INPUT_NAC
Definition: stream.h:156
virtual void internal_write(String const &data)
Write to the output string.
Definition: stream.cpp:992
StringInput(String const &str, Position::counter_t line=1)
Initialize the string input.
Definition: stream.cpp:780
void putc(byte_t c)
Push one byte in the decoder.
Definition: stream.cpp:71
static bool valid_character(as_char_t c)
Check whether a character is considered valid.
Definition: string.cpp:1015

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.

Syndicate content

Snap! Websites
An Open Source CMS System in C++

Contact Us Directly