stlab.adobe.com Adobe Systems Incorporated
xml_parser.hpp
Go to the documentation of this file.
1 /*
2  Copyright 2005-2007 Adobe Systems Incorporated
3  Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
4  or a copy at http://stlab.adobe.com/licenses.html)
5 */
6 
7 /*************************************************************************************************/
8 
9 #ifndef ADOBE_XML_PARSER_HPP
10 #define ADOBE_XML_PARSER_HPP
11 
12 /*************************************************************************************************/
13 
14 #include <adobe/config.hpp>
15 
16 #include <adobe/any_regular.hpp>
17 #include <adobe/algorithm/set.hpp>
18 #include <adobe/istream.hpp>
19 #include <adobe/array.hpp>
20 #include <adobe/copy_on_write.hpp>
21 #include <adobe/name.hpp>
22 #include <adobe/dictionary.hpp>
23 #include <adobe/string.hpp>
24 #include <adobe/implementation/xml_lex.hpp>
25 #include <adobe/implementation/xml_token.hpp>
26 #include <adobe/implementation/parser_shared.hpp>
27 
28 #include <boost/function.hpp>
29 #include <boost/noncopyable.hpp>
30 #include <boost/operators.hpp>
31 #include <boost/bind.hpp>
32 #include <boost/array.hpp>
33 #include <boost/iterator/iterator_facade.hpp>
34 
35 #include <utility>
36 #include <istream>
37 #include <sstream>
38 #include <iomanip>
39 #include <cassert>
40 #include <list>
41 
42 /*************************************************************************************************/
43 
44 namespace adobe {
45 
46 /*************************************************************************************************/
47 
48 // NOTE (fbrereto) : Class declaration for the documentation is in xml_parser.dox
49 struct attribute_set_t : public boost::equality_comparable<attribute_set_t>
50 {
53  typedef std::pair<key_type, mapped_type> value_type;
54  typedef std::vector<value_type> set_type;
55  typedef set_type::size_type size_type;
56  typedef set_type::const_iterator const_iterator;
57  typedef const_iterator iterator;
58 
65  struct less_t : std::binary_function<value_type, value_type, bool>
66  {
67  bool operator () (const value_type& x, const value_type& y) const
68  {
69  return token_range_less(x.first, y.first) ||
70  (!token_range_less(y.first, x.first) &&
71  token_range_less(x.second, y.second));
72  }
73  };
74 
80  struct less_key_only_t : std::binary_function<value_type, value_type, bool>
81  {
82  bool operator () (const value_type& x, const value_type& y) const
83  {
84  return token_range_less(x.first, y.first);
85  }
86  };
87 
99  bool lower_bound(const value_type& attribute, set_type::iterator& result)
100  {
101  result = adobe::lower_bound(set_m.write(), attribute, less_key_only_t());
102 
103  return result != set_m.write().end() &&
104  token_range_equal(result->first, attribute.first);
105  }
106 
118  bool lower_bound(const key_type& key, set_type::iterator& result)
119  { return lower_bound(value_type(key, mapped_type()), result); }
120 
124  bool lower_bound(const value_type& attribute, set_type::const_iterator& result) const
125  {
126  result = adobe::lower_bound(*set_m, attribute, less_key_only_t());
127 
128  return result != set_m->end() &&
129  token_range_equal(result->first, attribute.first);
130  }
131 
135  bool lower_bound(const key_type& key, set_type::const_iterator& result) const
136  { return lower_bound(value_type(key, mapped_type()), result); }
137 
147  mapped_type operator [] (const key_type& key) const
148  {
149  set_type::const_iterator result;
150 
151  if (lower_bound(key, result))
152  return result->second;
153 
154  return mapped_type();
155  }
156 
171  attribute_set_t merge(const attribute_set_t& other_set) const
172  {
173 
174  attribute_set_t merged;
175 
176  adobe::set_union(*set_m, *other_set.set_m, std::back_inserter(merged.set_m.write()), less_key_only_t());
177 
178  return merged;
179  }
180 
192  void insert(const value_type& attribute)
193  {
194  set_type::iterator result;
195 
196  if (lower_bound(attribute, result))
197  result->second = attribute.second;
198  else
199  set_m.write().insert(result, attribute);
200  }
201 
// Inserts every attribute in [first, last), one at a time and in order, using
// the single-attribute insert (so a later duplicate key overwrites the value
// stored by an earlier one).
template <typename I> // I models InputIterator
inline void insert(I first, I last)
{
    while (first != last)
    {
        insert(*first);
        ++first;
    }
}
215 
224  inline void insert(const key_type& key, const mapped_type& value)
225  { insert(value_type(key, value)); }
226 
237  std::size_t count_same(const attribute_set_t& other_set, bool mapped_matters = true) const;
238 
252  bool has_collisions(const attribute_set_t& other_set) const;
253 
263  std::size_t count_collisions(const attribute_set_t& other_set) const;
264 
268  inline bool empty() const
269  { return set_m->empty(); }
270 
275  inline size_type size() const
276  { return set_m->size(); }
277 
282  const_iterator begin() const { return set_m->begin(); }
283 
288  const_iterator end() const { return set_m->end(); }
289 
296  void clear() { set_m.write().clear(); }
297 
298 private:
299  friend bool operator == (const attribute_set_t& x, const attribute_set_t& y);
300  friend std::ostream& operator << (std::ostream& s, const attribute_set_t& attribute_set);
301 
303 };
304 
305 /*************************************************************************************************/
306 
319 inline bool operator == (const attribute_set_t& x, const attribute_set_t& y)
320 {
321  return x.set_m->size() == y.set_m->size() && x.count_same(y) == x.set_m->size();
322 }
323 
324 /*************************************************************************************************/
325 
337 inline std::ostream& operator << (std::ostream& s, const attribute_set_t& attribute_set)
338 {
339  attribute_set_t::set_type::const_iterator first(attribute_set.set_m->begin());
340  attribute_set_t::set_type::const_iterator last(attribute_set.set_m->end());
341  bool not_first(false);
342 
343  for (; first != last; ++first)
344  {
345  if (not_first)
346  s << " ";
347  else
348  not_first = true;
349 
350  adobe::copy(first->first, std::ostream_iterator<char>(s));
351 
352  s << "='";
353 
354  adobe::copy(first->second, std::ostream_iterator<char>(s));
355 
356  s << "'";
357  }
358 
359  return s;
360 }
361 
362 /*************************************************************************************************/
363 
364 inline std::size_t attribute_set_t::count_same(const attribute_set_t& other_set, bool mapped_matters) const
365 {
366  std::size_t result(0);
367 
368  if (mapped_matters)
369  result = adobe::set_intersection( *set_m, *other_set.set_m,
371  less_t())
372  .count();
373  else
374  result = adobe::set_intersection( *set_m, *other_set.set_m,
376  less_key_only_t())
377  .count();
378 
379  #if 0
380  std::cerr << " count_same:\n"
381  << " orig: " << *this << "\n"
382  << " test: " << other_set << "\n"
383  << " result: " << result << std::endl;
384  #endif
385 
386  return result;
387 }
388 
389 /*************************************************************************************************/
390 
391 inline bool attribute_set_t::has_collisions(const attribute_set_t& other_set) const
392 {
393  attribute_set_t::set_type::const_iterator first(set_m->begin());
394  attribute_set_t::set_type::const_iterator last(set_m->end());
395 
396  for (; first != last; ++first)
397  {
398  set_type::const_iterator result;
399 
400  if (other_set.lower_bound(*first, result) && !token_range_equal(result->second, first->second))
401  return true;
402  }
403 
404  return false;
405 }
406 
407 /*************************************************************************************************/
408 
409 inline std::size_t attribute_set_t::count_collisions(const attribute_set_t& other_set) const
410 {
411  attribute_set_t::set_type::const_iterator first(set_m->begin());
412  attribute_set_t::set_type::const_iterator last(set_m->end());
413  std::size_t collision_count(0);
414 
415  for (; first != last; ++first)
416  {
417  set_type::const_iterator result;
418 
419  if (other_set.lower_bound(*first, result) && result->second != first->second)
420  ++collision_count;
421  }
422 
423  return collision_count;
424 }
425 
426 /*************************************************************************************************/
427 
428 // REVISIT (sparent) : Extra typedef just for the doxygen tool.
429 
431  const token_range_t& entire_element_range,
432  const token_range_t& name,
433  const attribute_set_t& attribute_set,
434  const token_range_t& value);
435 
437 
438 /*************************************************************************************************/
439 
440 // NOTE (fbrereto) : Class declaration for the documentation is in xml_parser.dox
441 template <typename O> // O models OutputIterator
442 class xml_parser_t : public boost::noncopyable
443 {
444 public:
445  typedef xml_element_proc_t callback_proc_t;
447  typedef xml_lex_t::token_type token_type;
448 
450  uchar_ptr_t last,
451  const line_position_t& position,
452  preorder_predicate_t predicate,
453  callback_proc_t callback,
454  O output) :
455  pred_m(predicate),
456  callback_m(callback),
457  output_m(output),
458  token_stream_m(first, last, position),
459  preorder_mode_m(false)
460  { }
461 
463  pred_m(rhs.pred_m),
464  callback_m(rhs.callback_m),
465  output_m(rhs.output_m),
466  token_stream_m(rhs.token_stream_m),
467  preorder_mode_m(rhs.preorder_mode_m)
468  { }
469 
470  xml_parser_t& operator = (const xml_parser_t& rhs)
471  {
472  pred_m = rhs.pred_m;
473  callback_m = rhs.callback_m;
474  output_m = rhs.output_m;
475  token_stream_m = rhs.token_stream_m;
476  preorder_mode_m = rhs.preorder_mode_m;
477 
478  return *this;
479  }
480 
481  virtual ~xml_parser_t()
482  { }
483 
485  { return token_stream_m.next_position(); }
486 
492  void set_preorder_predicate(preorder_predicate_t pred)
493  { pred_m = pred; }
494 
518  void parse_element_sequence();
519 
565  void parse_content();
566 
570  void parse_document();
571 
572 /*
573  REVISIT (sparent) : We should provide a protected call to get the token stream and allow
574  subclasses to access it directly - but for now we'll stick with the Law of Demeter.
575 */
576 
577 protected:
578  const token_type& get_token()
579  { return token_stream_m.get(); }
580  void putback()
581  { token_stream_m.putback(); }
582 
583  bool is_token(xml_lex_token_set_t name, token_range_t& value);
584  bool is_token(xml_lex_token_set_t name);
585  void require_token(xml_lex_token_set_t name, token_range_t& value);
586  void require_token(xml_lex_token_set_t name);
587 
588  /* REVISIT (sparent) : Should these be const? And is there a way to specify the class to throw? */
589 
590  void throw_exception(const char* error_string)
591  { throw_parser_exception(error_string, next_position()); }
592  void throw_exception(xml_lex_token_set_t found, xml_lex_token_set_t expected)
593  { throw_parser_exception(token_to_string(found), token_to_string(expected), next_position()); }
594 
595  bool is_element(token_range_t& element);
596  bool is_content(token_range_t& element);
597  bool is_e_tag(token_range_t& name, token_range_t& close_tag);
598  bool is_attribute_set(attribute_set_t& attribute_set);
599  bool is_attribute(token_range_t& name, token_range_t& value);
600  bool is_prolog();
601  bool is_bom(token_range_t& bom);
602  bool is_xml_decl(token_range_t& xml_decl);
603 
604  void content_callback( token_range_t& result_element,
605  const token_range_t& old_element,
606  const token_range_t& start_tag,
607  const attribute_set_t attribute_set,
608  const token_range_t& content,
609  bool preorder_parent);
610 
611  preorder_predicate_t pred_m;
612  callback_proc_t callback_m;
614 
615 private:
616  xml_lex_t token_stream_m;
617  bool preorder_mode_m;
618 };
619 
620 /*************************************************************************************************/
621 
622 inline token_range_t xml_element_echo( const token_range_t& entire_element_range,
623  const token_range_t& /*name*/,
624  const attribute_set_t& /*attribute_set*/,
625  const token_range_t& /*value*/)
626  { return entire_element_range; }
627 
628 /*************************************************************************************************/
629 
630 inline token_range_t xml_element_strip( const token_range_t& /*entire_element_range*/,
631  const token_range_t& /*name*/,
632  const attribute_set_t& /*attribute_set*/,
633  const token_range_t& value)
634  { return value; }
635 
636 /*************************************************************************************************/
637 
638 inline token_range_t xml_element_linefeed( const token_range_t& /*entire_element_range*/,
639  const token_range_t& name,
640  const attribute_set_t& attribute_set,
641  const token_range_t& value)
642 {
643  if (token_range_equal(name, static_token_range("br")) &&
644  attribute_set.empty() &&
645  boost::size(value) == 0)
646  {
647 #if ADOBE_PLATFORM_WIN
648  return static_token_range("&cr;&lf;");
649 #elif ADOBE_PLATFORM_MAC
650  return static_token_range("&cr;");
651 #elif ADOBE_PLATFORM_UNIX || ADOBE_PLATFORM_LINUX || ADOBE_PLATFORM_BSD || ADOBE_PLATFORM_SOLARIS ||\
652  ADOBE_PLATFORM_IRIX || ADOBE_PLATFORM_HPUX || ADOBE_PLATFORM_CYGWIN || ADOBE_PLATFORM_AIX
653  return static_token_range("&lf;");
654 #else
655  #error "Line ending for platform unknown - please configure and report the results to stlab.adobe.com"
656 #endif
657  }
658 
659  return value;
660 }
661 
662 /*************************************************************************************************/
663 
664 namespace implementation {
665 
666 /*************************************************************************************************/
667 
668 token_range_t transform_reference(const token_range_t& reference);
669 
670 /*************************************************************************************************/
671 
672 } // namespace implementation
673 
674 /*************************************************************************************************/
675 
676 template <typename O> // O models OutputIterator
677 bool xml_parser_t<O>::is_token(xml_lex_token_set_t token_name, token_range_t& token_range)
678 {
679  const token_type& result(get_token());
680 
681  if (result.enum_m == token_name)
682  {
683  token_range = result.range_m;
684 
685  return true;
686  }
687 
688  putback();
689 
690  return false;
691 }
692 
693 /*************************************************************************************************/
694 
695 template <typename O> // O models OutputIterator
696 bool xml_parser_t<O>::is_token(xml_lex_token_set_t token_name)
697 {
698  const token_type& result(get_token());
699 
700  if (result.enum_m == token_name)
701  return true;
702 
703  putback();
704 
705  return false;
706 }
707 
708 /*************************************************************************************************/
709 
710 template <typename O> // O models OutputIterator
711 void xml_parser_t<O>::require_token(xml_lex_token_set_t token_name, token_range_t& token_range)
712 {
713  const token_type& result(get_token());
714 
715  if (result.enum_m != token_name)
716  throw_exception(result.enum_m, token_name);
717 
718  token_range = result.range_m;
719 }
720 
721 /*************************************************************************************************/
722 
723 template <typename O> // O models OutputIterator
724 void xml_parser_t<O>::require_token(xml_lex_token_set_t token_name)
725 {
726  const token_type& result(get_token());
727 
728  if (result.enum_m != token_name)
729  throw_exception(result.enum_m, token_name);
730 }
731 
732 /*************************************************************************************************/
733 
734 template <typename O> // O models OutputIterator
736  const token_range_t& old_element,
737  const token_range_t& start_tag,
738  const attribute_set_t attribute_set,
739  const token_range_t& content,
740  bool preorder_parent)
741 {
742  if (preorder_parent)
743  {
744  // if we are in preorder mode and we are the preorder_parent,
745  // we send the content to the client callback function.
746  // We get back a single token_range, which we then parse all
747  // over again in a content parser all its own.
748 
749  token_range_t new_content(callback_m(old_element, start_tag, attribute_set, content));
750 
751  if (old_element == new_content)
752  {
753  // In the case when the new content is the same as the old element,
754  // the user has opted to echo the element to the output unchanged.
755 
756  adobe::copy(old_element, output_m);
757  }
758  else
759  {
760  // otherwise we need to parse the new content before we can move on to
761  // the rest of the parse. The new parser has the same predicate and
762  // output iterator as this one
763 
764  xml_parser_t<O>( new_content.first, new_content.second,
765  next_position(), pred_m, callback_m, output_m).parse_content();
766  }
767 
768  // once the token_range from the client has been parsed, we can turn off
769  // preorder mode and resume parsing the original token stream from where we
770  // left off.
771 
772  preorder_mode_m = false; // only the preorder_parent can turn off preorder mode
773  }
774  else
775  {
776  // in the case we are in preorder mode but we are not the initiator of
777  // the mode, we are within the context of another preorder parse. In
778  // this case we use the entire contents of the element as the token range
779  // and hand it back as the return value of this function.
780 
781  result_element = old_element;
782  }
783 }
784 
785 /*************************************************************************************************/
786 
787 template <typename O> // O models OutputIterator
789 {
790  element = token_range_t();
791 
792  attribute_set_t attribute_set;
793 
794  token_range_t open_tag;
795  token_range_t close_tag;
796 
797  if (!is_token(xml_token_open_tag_k, open_tag)) return false;
798 
799  token_range_t start_tag;
800  token_range_t end_tag;
801 
802  require_token(xml_token_name_k, start_tag);
803 
804  bool preorder_parent(false); // explained below
805 
806  // Preorder mode is a state for the entire parser. In this state the
807  // client processing callback is never called until the end of the
808  // current element is found. This precludes the processing of elements
809  // and other entities nested within this element from being handled until
810  // this containing element is processed. This is useful in the case when
811  // the content of the element could potentially be replaced, in which
812  // case processing the nested elements first would be a moot point.
813 
814  if (!preorder_mode_m && pred_m)
815  {
816  // preorder mode is only set when the predicate is defined and
817  // returns true for the start_tag of this element.
818 
819  preorder_mode_m = pred_m(start_tag);
820 
821 
822  // preorder_parent is used to denote which frame in the stack began
823  // the preorder traversal, as it is this frame alone that can turn
824  // it back off again.
825 
826  preorder_parent = preorder_mode_m;
827  }
828 
829  is_attribute_set(attribute_set);
830 
831  if (is_token(xml_token_slash_close_tag_k, close_tag))
832  {
833  if (preorder_mode_m)
834  {
835  content_callback( element,
836  token_range_t(open_tag.first, close_tag.second),
837  start_tag,
838  attribute_set,
839  token_range_t(),
840  preorder_parent);
841  }
842  else
843  {
844  // in the case when we are not in preorder mode at all, we pass the element
845  // to the client callback and output the token_range we receive back.
846 
847  token_range_t result(callback_m( token_range_t(open_tag.first, close_tag.second),
848  start_tag,
849  attribute_set,
850  token_range_t()));
851 
852  adobe::copy(result, output_m);
853  }
854 
855  return true;
856  }
857 
858  token_range_t close_of_open_tag;
859 
860  require_token(xml_token_close_tag_k, close_of_open_tag);
861 
862  token_range_t content;
863 
864  // In the case of inorder parsing we want to output the tags
865  // as we see them; in this case we need to output the opening
866  // tag before we can go on to the content parsing.
867 
868  if (!preorder_mode_m)
869  std::copy(open_tag.first, close_of_open_tag.second, output_m);
870 
871  if (!is_content(content))
872  throw std::runtime_error("Content expected but not found.");
873 
874  if (!is_e_tag(end_tag, close_tag))
875  throw std::runtime_error("End tag expected but not found.");
876 
877  if (!token_range_equal(start_tag, end_tag))
878  throw std::runtime_error("Start tag and end tag do not have the same name.");
879 
880  if (!preorder_mode_m)
881  {
882  // in the case when we are not in preorder mode
883  // we output the content we have immediately,
884  // then we need to output the closing tag before
885  // we can go on to the rest of the parse.
886 
887  adobe::copy(content, output_m);
888  adobe::copy(token_range_t(end_tag.first - 2, end_tag.second + 1), output_m);
889  }
890  else
891  {
892  // In this instance we are continuing a preorder parse...
893 
894  content_callback( element,
895  token_range_t(open_tag.first, close_tag.second),
896  start_tag,
897  attribute_set,
898  content,
899  preorder_parent);
900  }
901 
902  return true;
903 }
904 
905 /*************************************************************************************************/
906 
907 template <typename O> // O models OutputIterator
909 {
910  content = token_range_t();
911 
912  token_range_t char_data;
913 
914  // NOTE (fbrereto) : The content parser can never initiate a preorder mode.
915  // It can only be initiated by the parsing of a preorder
916  // element, which isn't handled here. So for the content
917  // parse we are either in preorder mode or not; we need
918  // not worry about managing it.
919 
920  if (is_token(xml_token_char_data_k, char_data))
921  {
922  // in the case when we are in preorder mode, we are part of a nested
923  // content, and we want to use this beginning char_data token as the
924  // start of the overall content token_range.
925 
926  if (preorder_mode_m)
927  { content = char_data; }
928 
929  // in the case when we are not in preorder mode this range of char_data
930  // needs to be sent directly to the output.
931 
932  else
933  { adobe::copy(char_data, output_m); }
934  }
935 
936  while (true)
937  {
938  token_range_t result;
939 
940  if (is_token(xml_token_reference_k, result))
941  {
942  if (boost::size(result))
943  {
944  if (preorder_mode_m)
945  {
946  // Again, if we're in preorder mode we're not outputting
947  // but extending (possibly even starting, too) the token_range
948  // for the preorder element.
949 
950  if (!content.first) content.first = result.first;
951 
952  content.second = result.second;
953  }
954  else
955  {
956  // if we're not in preorder mode, we pass the element's
957  // reference-transformed token_range result directly to
958  // the output.
959 
960  adobe::copy(implementation::transform_reference(result), output_m);
961  }
962  }
963  }
964  else if (is_element(result))
965  {
966  if (boost::size(result))
967  {
968  if (preorder_mode_m)
969  {
970  // Again, if we're in preorder mode we're not outputting
971  // but extending (possibly even starting, too) the token_range
972  // for the preorder element.
973 
974  if (!content.first) content.first = result.first;
975 
976  content.second = result.second;
977  }
978  else
979  {
980  // if we're not in preorder mode, we pass the element's
981  // token_range result directly to the output.
982 
983  adobe::copy(result, output_m);
984  }
985  }
986  }
987  else if (is_token(xml_token_comment_k, result))
988  {
989  // Comments are not parsed by any client functions.
990  // They are merely ignored by the parser.
991 
992  // REVISIT eberdahl - Because some clients may want to
993  // handle comments, we may want to extend the client
994  // callback system to permit a comment callback.
995  }
996  else
997  { break; }
998 
999  if (is_token(xml_token_char_data_k, char_data))
1000  {
1001  // if we find more char_data at the end of the content, we
1002  // either extend the preorder content data or we output
1003  // the contents of the char_data directly to the output (in
1004  // fullorder mode).
1005 
1006  if (preorder_mode_m)
1007  { content.second = char_data.second; }
1008  else
1009  { adobe::copy(char_data, output_m); }
1010  }
1011  }
1012 
1013  return true;
1014 }
1015 
1016 /*************************************************************************************************/
1017 
1018 template <typename O> // O models OutputIterator
1020 {
1021  if (!is_token(xml_token_open_slash_tag_k)) return false;
1022 
1023  require_token(xml_token_name_k, name);
1024 
1025  require_token(xml_token_close_tag_k, close_tag);
1026 
1027  return true;
1028 }
1029 
1030 /*************************************************************************************************/
1031 
1032 template <typename O> // O models OutputIterator
1034 {
1035  token_range_t att_name;
1036  token_range_t att_value;
1037 
1038  while (is_attribute(att_name, att_value))
1039  attribute_set.insert(att_name, att_value);
1040 
1041  return true;
1042 }
1043 
1044 /*************************************************************************************************/
1045 
1046 template <typename O> // O models OutputIterator
1048 {
1049  token_range_t bom;
1050  token_range_t xml_decl;
1051 
1052  if (is_bom(bom))
1053  {
1054  // REVISIT eberdahl 2006 Jun 18 - sanity check the bom
1055  }
1056 
1057  if (is_xml_decl(xml_decl))
1058  {
1059  // REVISIT eberdahl 2006 Jun 18 - sanity check the encoding
1060  // of the XMLDecl
1061 
1062  return true;
1063  }
1064 
1065  return false;
1066 }
1067 
1068 /*************************************************************************************************/
1069 
1070 template <typename O> // O models OutputIterator
1072 {
1073  const token_range_t utf8_bom = static_token_range("\xEF\xBB\xBF");
1074  const token_range_t utf16_be_bom = static_token_range("\xFE\xFF");
1075  const token_range_t utf16_le_bom = static_token_range("\xFF\xFE");
1076 
1077  bool result = false;
1078 
1079  // whitespace skipping should be off when sniffing for a bom
1080  token_stream_m.set_skip_white_space(false);
1081 
1082  if (is_token(xml_token_char_data_k, bom))
1083  {
1084  if (boost::size(utf8_bom) <= boost::size(bom) &&
1085  adobe::equal(utf8_bom, bom.first))
1086  {
1087  bom.second = bom.first;
1088  std::advance(bom.second, boost::size(utf8_bom));
1089 
1090  result = true;
1091  }
1092  else if (boost::size(utf16_be_bom) <= boost::size(bom) &&
1093  adobe::equal(utf16_be_bom, bom.first))
1094  {
1095  // it's a bom, but it's not a format the parser supports
1096  throw_exception("utf16be bom encountered; xml_parser_t only supports utf8 encoding");
1097  }
1098  else if (boost::size(utf16_le_bom) <= boost::size(bom) &&
1099  adobe::equal(utf16_le_bom, bom.first))
1100  {
1101  // it's a bom, but it's not a format the parser supports
1102  throw_exception("utf16le bom encountered; xml_parser_t only supports utf8 encoding");
1103  }
1104  }
1105 
1106  token_stream_m.set_skip_white_space(true);
1107 
1108  return result;
1109 }
1110 
1111 /*************************************************************************************************/
1112 
1113 template <typename O> // O models OutputIterator
1115 {
1116  if (is_token(xml_token_processing_instruction_k, xml_decl))
1117  {
1118  // REVISIT eberdahl 2006 Jun 18 - sanity check that the PI
1119  // encountered is, in fact, targeted at the xml application
1120 
1121  return true;
1122  }
1123 
1124  return false;
1125 }
1126 
1127 /*************************************************************************************************/
1128 
1129 template <typename O> // O models OutputIterator
1131 {
1132  if (is_token(xml_token_name_k, name))
1133  {
1134  require_token(xml_token_equals_k);
1135 
1136  require_token(xml_token_att_value_k, value);
1137 
1138  return true;
1139  }
1140 
1141  return false;
1142 }
1143 
1144 /*************************************************************************************************/
1145 
1146 template <typename O> // O models OutputIterator
1148 {
1149  assert(callback_m);
1150 
1151  token_range_t dummy;
1152 
1153  token_stream_m.set_skip_white_space(false);
1154 
1155  while (is_element(dummy))
1156  is_token(xml_token_char_data_k);
1157 }
1158 
1159 /*************************************************************************************************/
1160 
1161 template <typename O> // O models OutputIterator
1163 {
1164  token_range_t content;
1165 
1166  token_stream_m.set_skip_white_space(false);
1167 
1168  while (true)
1169  {
1170  // always returns true; have to test results
1171  is_content(content);
1172 
1173  if (boost::size(content))
1174  {
1175  token_range_t result(this->callback_m( content,
1176  token_range_t(),
1177  attribute_set_t(),
1178  content));
1179 
1180  adobe::copy(result, this->output_m);
1181  }
1182  else
1183  break;
1184  }
1185 }
1186 
1187 /*************************************************************************************************/
1188 
1189 template <typename O> // O models OutputIterator
1191 {
1192  token_range_t dummy;
1193 
1194  token_stream_m.set_skip_white_space(true);
1195 
1196  is_prolog();
1197  is_element(dummy);
1198 }
1199 
1200 /*************************************************************************************************/
1201 
1221 template <typename O> // O models OutputIterator
1223  uchar_ptr_t last,
1224  const line_position_t& position,
1225  typename xml_parser_t<O>::preorder_predicate_t predicate,
1227  O output)
1228 { return xml_parser_t<O>(first, last, position, predicate, callback, output); }
1229 
1230 /*************************************************************************************************/
// Parses a run of hexadecimal digits (0-9, a-f, A-F) starting at `first`.
//
// `result` is reset to 0 and then, for every digit consumed, shifted left by
// four bits and increased by the digit's value. Parsing stops at `last` or at
// the first character that is not a hexadecimal digit; the iterator to that
// position is returned. No overflow checking is performed.
//
// The digits are classified with plain range comparisons instead of
// std::isxdigit / std::isdigit: passing a negative `char` (any byte >= 0x80 on
// platforms where char is signed) to those functions is undefined behaviour.
// This also removes the std::locale construction and facet lookup that used
// to run once per letter digit.
template <typename Result, typename InputIterator>
InputIterator xatoi(InputIterator first, InputIterator last, Result& result)
{
    typedef typename std::iterator_traits<InputIterator>::value_type char_type;

    result = 0;

    for (; first != last; ++first)
    {
        const char_type c(*first);
        int             digit;

        if (c >= '0' && c <= '9')
            digit = c - '0';
        else if (c >= 'a' && c <= 'f')
            digit = c - 'a' + 10;
        else if (c >= 'A' && c <= 'F')
            digit = c - 'A' + 10;
        else
            break; // not a hexadecimal digit - leave it for the caller

        result <<= 4;
        result += digit;
    }

    return first;
}
1263 
1264 /*************************************************************************************************/
// Parses a run of decimal digits (0-9) starting at `first`.
//
// `result` is reset to 0 and then, for every digit consumed, multiplied by
// ten and increased by the digit's value. Parsing stops at `last` or at the
// first character that is not a decimal digit; the iterator to that position
// is returned. No overflow checking is performed.
//
// The digits are classified with a plain range comparison instead of
// std::isdigit: passing a negative `char` (any byte >= 0x80 on platforms
// where char is signed) to std::isdigit is undefined behaviour.
template <typename Result, typename InputIterator>
InputIterator datoi(InputIterator first, InputIterator last, Result& result)
{
    result = 0;

    while (first != last && *first >= '0' && *first <= '9')
    {
        result *= 10;

        result += *first - '0';

        ++first;
    }

    return first;
}
1286 
1287 /*************************************************************************************************/
1288 
1289 } // namespace adobe
1290 
1291 /*************************************************************************************************/
1292 
1293 #endif
1294 
1295 /*************************************************************************************************/
bool is_token(xml_lex_token_set_t name, token_range_t &value)
Definition: xml_parser.hpp:677
bool is_e_tag(token_range_t &name, token_range_t &close_tag)
bool token_range_less(const token_range_t &x, const token_range_t &y)
bool operator()(const value_type &x, const value_type &y) const
Definition: xml_parser.hpp:67
friend std::ostream & operator<<(std::ostream &s, const attribute_set_t &attribute_set)
attribute_set_t merge(const attribute_set_t &other_set) const
Definition: xml_parser.hpp:171
xml_lex_t::token_type token_type
Definition: xml_parser.hpp:447
bool equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate pred)
Definition: equal.hpp:38
set_type::size_type size_type
Definition: xml_parser.hpp:55
A type detailing parser position information.
Definition: istream.hpp:153
OutputIterator set_intersection(const InputRange1 &range1, const InputRange2 &range2, OutputIterator result)
set implementation
Definition: set.hpp:121
std::vector< value_type > set_type
Definition: xml_parser.hpp:54
InputIterator xatoi(InputIterator first, InputIterator last, Result &result)
token_range_t xml_element_linefeed(const token_range_t &, const token_range_t &name, const attribute_set_t &attribute_set, const token_range_t &value)
Definition: xml_parser.hpp:638
void callback(std::ios_base::event ev, std::ios_base &strm, int idx)
Definition: iomanip.hpp:315
void insert(I first, I last)
Definition: xml_parser.hpp:213
friend bool operator==(const attribute_set_t &x, const attribute_set_t &y)
token_range_t static_token_range(T *begin)
I lower_bound(I f, I l, const T &x)
std::pair< uchar_ptr_t, uchar_ptr_t > token_range_t
A range of pointers denoting a token within a character stream.
const_iterator iterator
Definition: xml_parser.hpp:57
const_iterator end() const
Definition: xml_parser.hpp:288
virtual ~xml_parser_t()
Definition: xml_parser.hpp:481
preorder_predicate_t pred_m
Definition: xml_parser.hpp:611
size_type size() const
Definition: xml_parser.hpp:275
xml_parser_t(uchar_ptr_t first, uchar_ptr_t last, const line_position_t &position, preorder_predicate_t predicate, callback_proc_t callback, O output)
Definition: xml_parser.hpp:449
token_range_t key_type
Definition: xml_parser.hpp:51
std::pair< key_type, mapped_type > value_type
Definition: xml_parser.hpp:53
bool token_range_equal(const token_range_t &x, const token_range_t &y)
OutputIterator copy(const InputRange &range, OutputIterator result)
copy implementation
Definition: copy.hpp:43
token_range_t() implementation_xml_element_proc_t(const token_range_t &entire_element_range, const token_range_t &name, const attribute_set_t &attribute_set, const token_range_t &value)
Definition: xml_parser.hpp:430
bool is_element(token_range_t &element)
Definition: xml_parser.hpp:788
const_iterator begin() const
Definition: xml_parser.hpp:282
void insert(const value_type &attribute)
Definition: xml_parser.hpp:192
token_range_t xml_element_echo(const token_range_t &entire_element_range, const token_range_t &, const attribute_set_t &, const token_range_t &)
Definition: xml_parser.hpp:622
token_range_t xml_element_strip(const token_range_t &, const token_range_t &, const attribute_set_t &, const token_range_t &value)
Definition: xml_parser.hpp:630
bool lower_bound(const key_type &key, set_type::const_iterator &result) const
Definition: xml_parser.hpp:135
callback_proc_t callback_m
Definition: xml_parser.hpp:612
void throw_exception(const char *error_string)
Definition: xml_parser.hpp:590
mapped_type operator[](const key_type &key) const
Definition: xml_parser.hpp:147
bool is_bom(token_range_t &bom)
void insert(const key_type &key, const mapped_type &value)
Definition: xml_parser.hpp:224
A relatively lightweight and simple xml (subset) parser.
Definition: xml_parser.hpp:442
bool is_attribute_set(attribute_set_t &attribute_set)
bool has_collisions(const attribute_set_t &other_set) const
Definition: xml_parser.hpp:391
OutputIterator set_union(const InputRange1 &range1, const InputRange2 &range2, OutputIterator result)
set implementation
Definition: set.hpp:82
const line_position_t & next_position()
Definition: xml_parser.hpp:484
void set_preorder_predicate(preorder_predicate_t pred)
Definition: xml_parser.hpp:492
bool lower_bound(const key_type &key, set_type::iterator &result)
Definition: xml_parser.hpp:118
set_type::const_iterator const_iterator
Definition: xml_parser.hpp:56
void throw_exception(xml_lex_token_set_t found, xml_lex_token_set_t expected)
Definition: xml_parser.hpp:592
xml_parser_t(const xml_parser_t &rhs)
Definition: xml_parser.hpp:462
void content_callback(token_range_t &result_element, const token_range_t &old_element, const token_range_t &start_tag, const attribute_set_t attribute_set, const token_range_t &content, bool preorder_parent)
Definition: xml_parser.hpp:735
boost::function< implementation_xml_element_proc_t > xml_element_proc_t
Definition: xml_parser.hpp:436
InputIterator datoi(InputIterator first, InputIterator last, Result &result)
bool is_content(token_range_t &element)
Definition: xml_parser.hpp:908
bool lower_bound(const value_type &attribute, set_type::iterator &result)
Definition: xml_parser.hpp:99
const token_type & get_token()
Definition: xml_parser.hpp:578
bool lower_bound(const value_type &attribute, set_type::const_iterator &result) const
Definition: xml_parser.hpp:124
xml_parser_t< O > make_xml_parser(uchar_ptr_t first, uchar_ptr_t last, const line_position_t &position, typename xml_parser_t< O >::preorder_predicate_t predicate, typename xml_parser_t< O >::callback_proc_t callback, O output)
Create an object that will parse the indicated content range using the preorder and content functions...
std::size_t count_collisions(const attribute_set_t &other_set) const
Definition: xml_parser.hpp:409
void require_token(xml_lex_token_set_t name, token_range_t &value)
Definition: xml_parser.hpp:711
std::size_t count_same(const attribute_set_t &other_set, bool mapped_matters=true) const
Definition: xml_parser.hpp:364
boost::function< bool(const token_range_t &)> preorder_predicate_t
Definition: xml_parser.hpp:446
token_range_t mapped_type
Definition: xml_parser.hpp:52
boost::range_size< Selection >::type size(const Selection &x)
const unsigned char * uchar_ptr_t
bool is_attribute(token_range_t &name, token_range_t &value)
xml_element_proc_t callback_proc_t
Definition: xml_parser.hpp:445
bool is_xml_decl(token_range_t &xml_decl)
An associated array based on adobe::token_range_t. A utility class for the xml_parser_t.
Definition: xml_parser.hpp:49

Copyright © 2006-2007 Adobe Systems Incorporated.

Use of this website signifies your agreement to the Terms of Use and Online Privacy Policy.

Search powered by Google