Guest User

Untitled

a guest
Aug 10th, 2018
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.36 KB | None | 0 0
  1. Using Boost.Spirit to extract certain tags/attributes from HTML
  2. (?i:<img\s[^>]*src\s*=\s*[""']([^<][^""']+)[^>]*\s*/*>)
  3.  
  4. sehe@natty:/tmp$ time ./expressive < bench > /dev/null
  5.  
  6. real 0m2.146s
  7. user 0m2.110s
  8. sys 0m0.030s
  9.  
  10. typedef std::string::const_iterator It;
  11.  
  12. int main(int argc, const char *argv[])
  13. {
  14. using namespace boost::xpressive;
  15. #if DYNAMIC
  16. const sregex re = sregex::compile
  17. ("<img\s+[^\>]*?src\s*=\s*(["'])(.*?)\1");
  18. #else
  19. const sregex re = "<img" >> +_s >> -*(~(set = '\','>')) >>
  20. "src" >> *_s >> '=' >> *_s
  21. >> (s1 = as_xpr('"') | ''') >> (s2 = -*_) >> s1;
  22. #endif
  23.  
  24. std::string s;
  25. smatch what;
  26.  
  27. while (std::getline(std::cin, s))
  28. {
  29. It f = s.begin(), l = s.end();
  30.  
  31. do
  32. {
  33. if (!regex_search(f, l, what, re))
  34. break;
  35.  
  36. handle_attr("img", "src", what[2]);
  37. f = what[0].second;
  38. } while (f!=s.end());
  39. }
  40.  
  41. return 0;
  42. }
  43.  
  44. typedef std::string::const_iterator It;
  45.  
  46. int main(int argc, const char *argv[])
  47. {
  48. const boost::regex re("<img\s+[^\>]*?src\s*=\s*(["'])(.*?)\1");
  49.  
  50. std::string s;
  51. boost::smatch what;
  52.  
  53. while (std::getline(std::cin, s))
  54. {
  55. It f = s.begin(), l = s.end();
  56.  
  57. do
  58. {
  59. if (!boost::regex_search(f, l, what, re))
  60. break;
  61.  
  62. handle_attr("img", "src", what[2]);
  63. f = what[0].second;
  64. } while (f!=s.end());
  65. }
  66.  
  67. return 0;
  68. }
  69.  
  70. ./test < index.htm
  71.  
  72. sehe@natty:/tmp$ time ./spirit < bench > /dev/null
  73.  
  74. real 0m3.895s
  75. user 0m3.820s
  76. sys 0m0.070s
  77.  
  78. //#define BOOST_SPIRIT_DEBUG
  79. #include <string>
  80. #include <iostream>
  81. #include <boost/spirit/include/qi.hpp>
  82. #include <boost/spirit/include/phoenix.hpp>
  83.  
  84. namespace qi = boost::spirit::qi;
  85. namespace phx = boost::phoenix;
  86.  
  /// Reports one parsed attribute.
  ///
  /// Only the `src` attribute of `img` elements produces output: the line
  /// "value : <value>" is written to std::cout. Every other element/attribute
  /// combination is ignored.
  ///
  /// @param elem  element (tag) name the attribute belongs to.
  /// @param attr  attribute name.
  /// @param value attribute value with the surrounding quotes already removed.
  void handle_attr(
          const std::string& elem,
          const std::string& attr,
          const std::string& value)
  {
      // '\n' instead of std::endl: this is called once per match, and a forced
      // flush on every call is needless work.
      if (elem == "img" && attr == "src")
          std::cout << "value : " << value << '\n';
  }
  95.  
  96. typedef std::string::const_iterator It;
  97. typedef qi::space_type Skipper;
  98.  
  99. struct grammar : qi::grammar<It, Skipper>
  100. {
  101. grammar() : grammar::base_type(html)
  102. {
  103. using namespace boost::spirit::qi;
  104. using phx::bind;
  105.  
  106. attr = as_string [ +~char_("= trn/>") ] [ _a = _1 ]
  107. >> '=' >> (
  108. as_string [ '"' >> lexeme [ *~char_('"') ] >> '"' ]
  109. | as_string [ "'" >> lexeme [ *~char_("'") ] >> "'" ]
  110. ) [ bind(handle_attr, _r1, _a, _1) ]
  111. ;
  112.  
  113. elem = lit('<')
  114. >> as_string [ lexeme [ ~char_("-/>") >> *(char_ - space - char_("/>")) ] ] [ _a = _1 ]
  115. >> *attr(_a);
  116.  
  117. html = (-elem) % +("</" | (char_ - '<'));
  118.  
  119. BOOST_SPIRIT_DEBUG_NODE(html);
  120. BOOST_SPIRIT_DEBUG_NODE(elem);
  121. BOOST_SPIRIT_DEBUG_NODE(attr);
  122. }
  123.  
  124. qi::rule<It, Skipper> html;
  125. qi::rule<It, Skipper, qi::locals<std::string> > elem;
  126. qi::rule<It, qi::unused_type(std::string), Skipper, qi::locals<std::string> > attr;
  127. };
  128.  
  129. int main(int argc, const char *argv[])
  130. {
  131. std::string s;
  132.  
  133. const static grammar html_;
  134.  
  135. while (std::getline(std::cin, s))
  136. {
  137. It f = s.begin(),
  138. l = s.end();
  139.  
  140. if (!phrase_parse(f, l, html_, qi::space) || (f!=l))
  141. std::cerr << "unparsed: " << std::string(f,l) << std::endl;
  142. }
  143.  
  144. return 0;
  145. }
Add Comment
Please, Sign In to add comment