daily pastebin goal
55%
SHARE
TWEET

Untitled

a guest Aug 10th, 2018 65 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. Using Boost.Spirit to extract certain tags/attributes from HTML
  2. (?i:<img\s[^>]*src\s*=\s*[""']([^<][^""']+)[^>]*\s*/*>)
  3.    
  4. sehe@natty:/tmp$ time ./expressive < bench > /dev/null
  5.  
  6. real    0m2.146s
  7. user    0m2.110s
  8. sys 0m0.030s
  9.    
  10. typedef std::string::const_iterator It;
  11.  
  12. int main(int argc, const char *argv[])
  13. {
  14.     using namespace boost::xpressive;
  15. #if DYNAMIC
  16.     const sregex re = sregex::compile
  17.          ("<img\s+[^\>]*?src\s*=\s*(["'])(.*?)\1");
  18. #else
  19.     const sregex re = "<img" >> +_s >> -*(~(set = '\','>')) >>
  20.         "src" >> *_s >> '=' >> *_s
  21.         >> (s1 = as_xpr('"') | ''') >> (s2 = -*_) >> s1;
  22. #endif
  23.  
  24.     std::string s;
  25.     smatch what;
  26.  
  27.     while (std::getline(std::cin, s))
  28.     {
  29.         It f = s.begin(), l = s.end();
  30.  
  31.         do
  32.         {
  33.             if (!regex_search(f, l, what, re))
  34.                 break;
  35.  
  36.             handle_attr("img", "src", what[2]);
  37.             f = what[0].second;
  38.         } while (f!=s.end());
  39.     }
  40.  
  41.     return 0;
  42. }
  43.    
  44. typedef std::string::const_iterator It;
  45.  
  46. int main(int argc, const char *argv[])
  47. {
  48.     const boost::regex re("<img\s+[^\>]*?src\s*=\s*(["'])(.*?)\1");
  49.  
  50.     std::string s;
  51.     boost::smatch what;
  52.  
  53.     while (std::getline(std::cin, s))
  54.     {
  55.         It f = s.begin(), l = s.end();
  56.  
  57.         do
  58.         {
  59.             if (!boost::regex_search(f, l, what, re))
  60.                 break;
  61.  
  62.             handle_attr("img", "src", what[2]);
  63.             f = what[0].second;
  64.         } while (f!=s.end());
  65.     }
  66.  
  67.     return 0;
  68. }
  69.    
  70. ./test < index.htm
  71.    
  72. sehe@natty:/tmp$ time ./spirit < bench > /dev/null
  73.  
  74. real    0m3.895s
  75. user    0m3.820s
  76. sys 0m0.070s
  77.    
  78. //#define BOOST_SPIRIT_DEBUG
  79. #include <string>
  80. #include <iostream>
  81. #include <boost/spirit/include/qi.hpp>
  82. #include <boost/spirit/include/phoenix.hpp>
  83.  
  84. namespace qi  = boost::spirit::qi;
  85. namespace phx = boost::phoenix;
  86.  
  /**
   * Callback invoked for each (element, attribute, value) triple that a
   * parser reports.
   *
   * Only the `src` attribute of `img` elements is acted upon: its value is
   * written to standard output as "value : <value>" followed by a newline.
   * Every other combination is ignored.
   *
   * @param elem   element (tag) name
   * @param attr   attribute name
   * @param value  attribute value
   */
  void handle_attr(
          const std::string& elem,
          const std::string& attr,
          const std::string& value)
  {
      // '\n' rather than std::endl: the bytes written are the same, but the
      // stream is not flushed once per attribute.
      if (elem == "img" && attr == "src")
          std::cout << "value : " << value << '\n';
  }
  95.  
  96. typedef std::string::const_iterator It;
  97. typedef qi::space_type Skipper;
  98.  
  99. struct grammar : qi::grammar<It, Skipper>
  100. {
  101.     grammar() : grammar::base_type(html)
  102.     {
  103.         using namespace boost::spirit::qi;
  104.         using phx::bind;
  105.  
  106.         attr = as_string [ +~char_("= trn/>") ] [ _a = _1 ]
  107.                 >> '=' >> (
  108.                     as_string [ '"' >> lexeme [ *~char_('"') ] >> '"' ]
  109.                   | as_string [ "'" >> lexeme [ *~char_("'") ] >> "'" ]
  110.                   ) [ bind(handle_attr, _r1, _a, _1) ]
  111.             ;
  112.  
  113.         elem = lit('<')
  114.             >> as_string [ lexeme [ ~char_("-/>") >> *(char_ - space - char_("/>")) ] ] [ _a = _1 ]
  115.             >> *attr(_a);
  116.  
  117.         html = (-elem) % +("</" | (char_ - '<'));
  118.  
  119.         BOOST_SPIRIT_DEBUG_NODE(html);
  120.         BOOST_SPIRIT_DEBUG_NODE(elem);
  121.         BOOST_SPIRIT_DEBUG_NODE(attr);
  122.     }
  123.  
  124.     qi::rule<It, Skipper> html;
  125.     qi::rule<It, Skipper, qi::locals<std::string> > elem;
  126.     qi::rule<It, qi::unused_type(std::string), Skipper, qi::locals<std::string> > attr;
  127. };
  128.  
  129. int main(int argc, const char *argv[])
  130. {
  131.     std::string s;
  132.  
  133.     const static grammar html_;
  134.  
  135.     while (std::getline(std::cin, s))
  136.     {
  137.         It f = s.begin(),
  138.            l = s.end();
  139.  
  140.         if (!phrase_parse(f, l, html_, qi::space) || (f!=l))
  141.             std::cerr << "unparsed: " << std::string(f,l) << std::endl;
  142.     }
  143.  
  144.     return 0;
  145. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top