Advertisement
Guest User

Untitled

a guest
Oct 24th, 2015
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 6 8.96 KB | None | 0 0
  1. use v6;
  2.  
  3. use NativeCall;
  4. use XML;
  5.  
  6. #use GumboTag;
  7.  
  8. module Gumbo {
  9.   class gumbo_node_t is repr('CPointer') {};
  10.   class gumbo_output_t is repr('CPointer') {};
  11.   class gumbo_attribute_t is repr('CPointer') {};
  12.  
  13.  
  14.   enum gumbo_node_type (
  15.      GUMBO_NODE_DOCUMENT => 0,
  16.      GUMBO_NODE_ELEMENT => 1,
  17.      GUMBO_NODE_TEXT => 2,
  18.      GUMBO_NODE_CDATA => 3,
  19.      GUMBO_NODE_COMMENT => 4,
  20.      GUMBO_NODE_WHITESPACE => 5,
  21.      GUMBO_NODE_TEMPLATE => 6
  22.   );
  23.  
  24. #   typedef struct {
  25. #    68   unsigned int line;
  26. #    69   unsigned int column;
  27. #    70   unsigned int offset;
  28. #    71 } GumboSourcePosition;
  29.  
  30.   class gumbo_source_position is repr('CStruct') {
  31.     has uint32  $.line;
  32.     has uint32  $.column;
  33.     has uint32  $.offset;
  34.   }
  35. #   typedef struct {
  36. #    90   const char* data;
  37. #    91
  38. #    93   size_t length;
  39. #    94 } GumboStringPiece;
  40. #  
  41.   class gumbo_string_piece_s is repr('CStruct') {
  42.     has str     $.data;
  43.     has uint32      $.length;
  44.   }
  45.  
  46.   #    typedef struct {
  47. #      void** data;
  48. #    
  49. #      unsigned int length;
  50. #    
  51. #      unsigned int capacity;
  52. #    } GumboVector;
  53.  
  54.   class gumbo_vector_s is repr('CStruct') {
  55.     has OpaquePointer $.data;
  56.     has uint32        $.length;
  57.     has uint32        $.capacity;
  58.   }
  59.  
  60.     #typedef struct {
  61.   #   GumboVector /* GumboNode* */ children;
  62.   #
  63.   #   // True if there was an explicit doctype token as opposed to it being omitted.
  64. #      bool has_doctype;
  65. #    
  66. #      // Fields from the doctype token, copied verbatim.
  67. #      const char* name;
  68. #      const char* public_identifier;
  69. #      const char* system_identifier;
  70. #    
  71. #      GumboQuirksModeEnum doc_type_quirks_mode;
  72. #    } GumboDocument;
  73. #  
  74.   class gumbo_document_s is repr('CStruct') {
  75.      HAS gumbo_vector_s $.children;
  76.      has int8       $.has_doctype;
  77.      has str        $.name;
  78.      has str        $.public_identifier;
  79.      has str        $.system_identifier;
  80.      has int32      $.doc_type_quirks_mode;
  81.   }
  82.  
  83.  
  84. #   typedef struct {
  85. #   231   GumboAttributeNamespaceEnum attr_namespace;
  86. #   232
  87. #   237   const char* name;
  88. #   238
  89. #   243   GumboStringPiece original_name;
  90. #   244
  91. #   251   const char* value;
  92. #   252
  93. #   261   GumboStringPiece original_value;
  94. #   262
  95. #   264   GumboSourcePosition name_start;
  96. #   265
  97. #   271   GumboSourcePosition name_end;
  98. #   272
  99. #   274   GumboSourcePosition value_start;
  100. #   275
  101. #   277   GumboSourcePosition value_end;
  102. #   278 } GumboAttribute;
  103. #   279
  104. #  
  105.   class gumbo_attribute_s is repr('CStruct') {
  106.     has int32           $.attr_namespace;
  107.     has str         $.name;
  108.     HAS gumbo_string_piece_s    $.original_name;
  109.     has str         $.value;
  110.     HAS gumbo_string_piece_s    $.original_value;
  111.     HAS gumbo_source_position   $.name_start;
  112.     HAS gumbo_source_position   $.name_end;
  113.     HAS gumbo_source_position   $.value_start;
  114.     HAS gumbo_source_position   $.value_end;
  115.   }
  116.  
  117. #   typedef struct {
  118. #   453   const char* text;
  119. #   454
  120. #   459   GumboStringPiece original_text;
  121. #   460
  122. #   465   GumboSourcePosition start_pos;
  123. #   466 } GumboText;
  124.  
  125.     class gumbo_text_s is repr('CStruct') {
  126.       has str           $.text;
  127.       HAS gumbo_string_piece_s  $.original_text;
  128.       HAS gumbo_source_position $.start_pos;
  129.     }
  130.    
  131. #      typedef struct {
  132. #   477   GumboVector /* GumboNode* */ children;
  133. #   478
  134. #   480   GumboTag tag;
  135. #   481
  136. #   483   GumboNamespaceEnum tag_namespace;
  137. #   484
  138. #   491   GumboStringPiece original_tag;
  139. #   492
  140. #   498   GumboStringPiece original_end_tag;
  141. #   499
  142. #   501   GumboSourcePosition start_pos;
  143. #   502
  144. #   504   GumboSourcePosition end_pos;
  145. #   505
  146. #   510   GumboVector /* GumboAttribute* */ attributes;
  147. #   511 } GumboElement;
  148.  
  149.   class gumbo_element_s is repr('CStruct') {
  150.     HAS gumbo_vector_s      $.children;
  151.     has int32           $.tag;
  152.     has int32           $.tag_namespace;
  153.     HAS gumbo_string_piece_s    $.original_tag;
  154.     HAS gumbo_string_piece_s    $.original_end_tag;
  155.     HAS gumbo_source_position   $.start_pos;
  156.     HAS gumbo_source_position   $.end_pos;
  157.     HAS gumbo_vector_s      $.attributes;
  158.   }
  159.  
  160. #   struct GumboInternalNode {
  161. #     GumboNodeType type;
  162. #  
  163. #     GumboNode* parent;
  164. #  
  165. #      size_t index_within_parent;
  166. #    
  167. #     GumboParseFlags parse_flags;
  168. #    
  169. #      union {
  170. #        GumboDocument document;  // For GUMBO_NODE_DOCUMENT.
  171. #        GumboElement element;    // For GUMBO_NODE_ELEMENT.
  172. #        GumboText text;          // For everything else.
  173. #      } v;
  174. #    };
  175.   class g_node_union is repr('CUnion') {
  176.     HAS gumbo_document_s    $.document;
  177.     HAS gumbo_element_s     $.element;
  178.     HAS gumbo_text_s        $.text;
  179.   }
  180.  
  181.   class gumbo_node_s is repr('CStruct') {
  182.     has int32       $.type;
  183.     has gumbo_node_s    $.parent;
  184.     has uint32      $.index_within_parent;
  185.     has int32       $.parse_flags;
  186.     HAS g_node_union    $.v;
  187.   }
  188.  
  189.   class gumbo_vector_t is repr('CPointer') {};
  190.  
  191.  
  192.  
  193.  
  194. #   typedef struct GumboInternalOutput {
  195. #      GumboNode* document;
  196. #    
  197. #      GumboNode* root;
  198. #    
  199. #      GumboVector /* GumboError */ errors;
  200. #    } GumboOutput;
  201. #  
  202.   class gumbo_output_s is repr('CStruct') {
  203.     has gumbo_node_t $.document;
  204.     has gumbo_node_t $.root;
  205.     HAS gumbo_vector_s $.errors;
  206.   }
  207.  
  208.  
  209.   sub gumbo_parse(Str) is native('libgumbo') returns gumbo_output_t { * }
  210.   sub gumbo_normalized_tagname(int32) is native('libgumbo') returns str { * }
  211.  
  212.   sub gumbo-type-size {
  213.     for gumbo_output_s, gumbo_vector_s, gumbo_attribute_s, gumbo_document_s, gumbo_element_s, gumbo_node_s, gumbo_source_position, gumbo_string_piece_s, gumbo_text_s -> $type {
  214.       say $type.perl~" : "~nativesizeof($type);
  215.     }
  216.   }
  217.  
  218.   sub parse-html (Str $html) is export {
  219.     say $html;
  220.     my $xmlroot;
  221.     gumbo-type-size();
  222.     my gumbo_output_t $gumbo_output = gumbo_parse($html);
  223.  
  224.     say $gumbo_output.perl;
  225.     my gumbo_output_s $go = nativecast(gumbo_output_s, $gumbo_output);
  226.     say $go.perl;
  227.     my gumbo_node_s $groot = nativecast(gumbo_node_s, $go.root);
  228.     say $groot.type;
  229.     if ($groot.type eq GUMBO_NODE_ELEMENT.value) {
  230.       $xmlroot = build-element($groot.v.element);
  231.       my $tab_child = nativecast(CArray[gumbo_node_t], $groot.v.element.children.data);
  232.       loop (my $i = 0; $i < $groot.v.element.children.length; $i++) {
  233.     build-tree(nativecast(gumbo_node_s, $tab_child[$i]), $xmlroot);
  234.       }
  235.     }
  236.     print_xml($xmlroot);
  237.     #say $go.root.perl;
  238.     #my gumbo_vector_s $vec = $go.errors;
  239.     #say $vec.perl;
  240.   }
  241.  
  242.   sub   build-tree(gumbo_node_s $node, XML::Element $parent is rw) {
  243.     given $node.type {
  244.       when GUMBO_NODE_ELEMENT.value {
  245.         my $xml = build-element($node.v.element);
  246.         $parent.append($xml);
  247.         my $tab_child = nativecast(CArray[gumbo_node_t], $node.v.element.children.data);
  248.     loop (my $i = 0; $i < $node.v.element.children.length; $i++) {
  249.       build-tree(nativecast(gumbo_node_s, $tab_child[$i]), $xml);
  250.     }
  251.       }
  252.       when GUMBO_NODE_TEXT.value {
  253.         my $xml = XML::Text.new(text => $node.v.text.text);
  254.         #$xml.text = $node.v.text.text;
  255.         $parent.append($xml);
  256.       }
  257.     }
  258.   }
  259.  
  260.   sub build-element(gumbo_element_s $elem) {
  261.     my $xml = XML::Element.new;
  262.     $xml.name = gumbo_normalized_tagname($elem.tag);
  263.     say $elem.attributes.^name;
  264.     say $elem.attributes.defined;
  265.     return $xml unless $elem.attributes.defined;
  266.     say $xml.name ~"- attr number : "~$elem.attributes.length;
  267.     my $tab_attr = nativecast(CArray[gumbo_attribute_t], $elem.attributes.data);
  268.     loop (my $i = 0; $i < $elem.attributes.length; $i++) {
  269.       my $cattr = nativecast(gumbo_attribute_s, $tab_attr[$i]);
  270.       say $cattr.attr_namespace;
  271.       say $cattr.value;
  272.       say $cattr.name;
  273.       $xml.attribs{$cattr.name} = $cattr.value;
  274.     }
  275.     return $xml;
  276.   }
  277.   sub     print_xml ($xmldoc, $cpt = 0){
  278.     if $xmldoc ~~ XML::Comment {
  279.       return ;
  280.     }
  281.     if $xmldoc ~~ XML::Text {
  282.       return ;
  283.     }
  284.     say "--" x $cpt, "<" , $xmldoc.name, $xmldoc.attribs.keys.join(',') if $xmldoc ~~ XML::Element;
  285.     return if ! $xmldoc.nodes.Bool;
  286.     for $xmldoc.nodes -> $mychild {
  287.        print_xml($mychild, $cpt + 1);
  288.     }
  289.     say "--" x $cpt, "</" , $xmldoc.name if $xmldoc ~~ XML::Element;
  290. }
  291.  
  292. }
  293.  
  294.  
  295. root@testperl6:~/piko# perl6 gumbo.pl
  296. <html><head class='piko'><title>piko</title></html>
  297. gumbo_output_s : 20
  298. gumbo_vector_s : 12
  299. gumbo_attribute_s : 76
  300. gumbo_document_s : 32
  301. gumbo_element_s : 72
  302. gumbo_node_s : 88
  303. gumbo_source_position : 12
  304. gumbo_string_piece_s : 8
  305. gumbo_text_s : 24
  306. gumbo_output_t.new
  307. gumbo_output_s.new(document => gumbo_node_t.new, root => gumbo_node_t.new, errors => gumbo_vector_s.new(data => Pointer.new(263536920), length => 1, capacity => 5))
  308. 1
  309. gumbo_vector_s
  310. True
  311. html- attr number : 0
  312. gumbo_vector_s
  313. True
  314. head- attr number : 1
  315. 0
  316. String corruption detected: bad storage type
  317.   in sub build-element at /root/piko/Gumbo.pm6:270
  318.   in block  at /root/piko/Gumbo.pm6:244
  319.   in sub build-tree at /root/piko/Gumbo.pm6:242
  320.   in sub parse-html at /root/piko/Gumbo.pm6:232
  321.   in block <unit> at gumbo.pl:4
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement