Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- use v6;
- use NativeCall;
- use XML;
- #use GumboTag;
- module Gumbo {
- # Opaque handle types: raw C pointers that are reinterpreted as the
- # matching CStruct with nativecast() before any field is read.
- class gumbo_node_t      is repr('CPointer') { }
- class gumbo_output_t    is repr('CPointer') { }
- class gumbo_attribute_t is repr('CPointer') { }
- # Node kinds as integers.  The word-list form numbers its keys
- # 0, 1, 2, ... in the order written, so GUMBO_NODE_DOCUMENT is 0 and
- # GUMBO_NODE_TEMPLATE is 6; do not reorder the names.
- enum gumbo_node_type <
-     GUMBO_NODE_DOCUMENT
-     GUMBO_NODE_ELEMENT
-     GUMBO_NODE_TEXT
-     GUMBO_NODE_CDATA
-     GUMBO_NODE_COMMENT
-     GUMBO_NODE_WHITESPACE
-     GUMBO_NODE_TEMPLATE
- >;
- # typedef struct {
- #   unsigned int line;
- #   unsigned int column;
- #   unsigned int offset;
- # } GumboSourcePosition;
- # NativeCall binding for C's GumboSourcePosition: three `unsigned int`
- # members, declared here as uint32 in the same order as the C struct so
- # that each attribute lines up with its C field.
- class gumbo_source_position is repr('CStruct') {
- has uint32 $.line;
- has uint32 $.column;
- has uint32 $.offset;
- }
- # typedef struct {
- #   const char* data;
- #   size_t length;
- # } GumboStringPiece;
- #
- # NativeCall binding for C's GumboStringPiece (`const char* data`
- # followed by `size_t length`).
- #
- # `data` is kept as a raw Pointer instead of a string type: the Gumbo
- # headers say a string piece need not be NUL-terminated, so it has to be
- # read using the explicit `length` rather than decoded as a C string.
- # `length` is declared size_t, exactly as in the C struct, so the layout
- # also holds on platforms where size_t is wider than 32 bits.
- class gumbo_string_piece_s is repr('CStruct') {
-     has Pointer $.data;
-     has size_t  $.length;
- }
- # typedef struct {
- # void** data;
- #
- # unsigned int length;
- #
- # unsigned int capacity;
- # } GumboVector;
- # NativeCall binding for C's GumboVector: a `void**` data pointer
- # followed by the `length` and `capacity` counters.
- class gumbo_vector_s is repr('CStruct') {
-     has Pointer $.data;        # void** in C; cast to a CArray before indexing
-     has uint32  $.length;
-     has uint32  $.capacity;
- }
- #typedef struct {
- # GumboVector /* GumboNode* */ children;
- #
- # // True if there was an explicit doctype token as opposed to it being omitted.
- # bool has_doctype;
- #
- # // Fields from the doctype token, copied verbatim.
- # const char* name;
- # const char* public_identifier;
- # const char* system_identifier;
- #
- # GumboQuirksModeEnum doc_type_quirks_mode;
- # } GumboDocument;
- #
- # NativeCall binding for C's GumboDocument.
- #
- # The three `const char*` members use the boxed `Str` type, which is the
- # type NativeCall documents for C strings stored in a CStruct.
- # NOTE(review): with the lower-case native `str` the member seems to be
- # read back without being decoded from the char* -- confirm against the
- # NativeCall documentation.
- class gumbo_document_s is repr('CStruct') {
-     HAS gumbo_vector_s $.children;              # embedded GumboVector
-     has int8           $.has_doctype;           # C bool
-     has Str            $.name;
-     has Str            $.public_identifier;
-     has Str            $.system_identifier;
-     has int32          $.doc_type_quirks_mode;  # GumboQuirksModeEnum
- }
- # typedef struct {
- #   GumboAttributeNamespaceEnum attr_namespace;
- #   const char* name;
- #   GumboStringPiece original_name;
- #   const char* value;
- #   GumboStringPiece original_value;
- #   GumboSourcePosition name_start;
- #   GumboSourcePosition name_end;
- #   GumboSourcePosition value_start;
- #   GumboSourcePosition value_end;
- # } GumboAttribute;
- #
- # NativeCall binding for C's GumboAttribute.
- #
- # `name` and `value` are `const char*` in C and are declared with the
- # boxed `Str` type, which is what NativeCall documents for C strings held
- # in a CStruct.  With the native `str` type, reading an attribute string
- # from this struct aborted with "String corruption detected: bad storage
- # type".
- #
- # `HAS` embeds the nested struct inline, matching the by-value members of
- # the C struct.
- class gumbo_attribute_s is repr('CStruct') {
-     has int32                 $.attr_namespace;   # GumboAttributeNamespaceEnum
-     has Str                   $.name;
-     HAS gumbo_string_piece_s  $.original_name;
-     has Str                   $.value;
-     HAS gumbo_string_piece_s  $.original_value;
-     HAS gumbo_source_position $.name_start;
-     HAS gumbo_source_position $.name_end;
-     HAS gumbo_source_position $.value_start;
-     HAS gumbo_source_position $.value_end;
- }
- # typedef struct {
- #   const char* text;
- #   GumboStringPiece original_text;
- #   GumboSourcePosition start_pos;
- # } GumboText;
- # NativeCall binding for C's GumboText.
- #
- # `text` is a `const char*` in C and is declared with the boxed `Str`
- # type, which is what NativeCall documents for C strings held in a
- # CStruct (the native `str` type is not decoded the same way).
- class gumbo_text_s is repr('CStruct') {
-     has Str                   $.text;
-     HAS gumbo_string_piece_s  $.original_text;
-     HAS gumbo_source_position $.start_pos;
- }
- # typedef struct {
- #   GumboVector /* GumboNode* */ children;
- #   GumboTag tag;
- #   GumboNamespaceEnum tag_namespace;
- #   GumboStringPiece original_tag;
- #   GumboStringPiece original_end_tag;
- #   GumboSourcePosition start_pos;
- #   GumboSourcePosition end_pos;
- #   GumboVector /* GumboAttribute* */ attributes;
- # } GumboElement;
- # NativeCall binding for C's GumboElement.  Attributes are declared in
- # the same order as the C members; `HAS` embeds the nested struct inline
- # (the C members are held by value), while `tag` and `tag_namespace` are
- # stored as 32-bit integers.
- class gumbo_element_s is repr('CStruct') {
- HAS gumbo_vector_s $.children;
- has int32 $.tag;
- has int32 $.tag_namespace;
- HAS gumbo_string_piece_s $.original_tag;
- HAS gumbo_string_piece_s $.original_end_tag;
- HAS gumbo_source_position $.start_pos;
- HAS gumbo_source_position $.end_pos;
- HAS gumbo_vector_s $.attributes;
- }
- # struct GumboInternalNode {
- # GumboNodeType type;
- #
- # GumboNode* parent;
- #
- # size_t index_within_parent;
- #
- # GumboParseFlags parse_flags;
- #
- # union {
- # GumboDocument document; // For GUMBO_NODE_DOCUMENT.
- # GumboElement element; // For GUMBO_NODE_ELEMENT.
- # GumboText text; // For everything else.
- # } v;
- # };
- # Binding for the anonymous union `v` inside GumboNode: the same storage
- # is viewed as a document, an element or a text payload.  Which view is
- # valid depends on the node's `type` field.
- class g_node_union is repr('CUnion') {
- HAS gumbo_document_s $.document;
- HAS gumbo_element_s $.element;
- HAS gumbo_text_s $.text;
- }
- # NativeCall binding for C's GumboNode (struct GumboInternalNode).
- #
- # `index_within_parent` is a size_t in the C struct, so it is declared
- # size_t here as well.  Binding it as a 32-bit integer only matches the C
- # layout where size_t is 32 bits wide; with a 64-bit size_t every member
- # after it would be read from the wrong offset.
- class gumbo_node_s is repr('CStruct') {
-     has int32        $.type;                  # GumboNodeType
-     has gumbo_node_s $.parent;                # GumboNode*
-     has size_t       $.index_within_parent;
-     has int32        $.parse_flags;           # GumboParseFlags
-     HAS g_node_union $.v;                     # embedded union, selected by $.type
- }
- # Opaque pointer type for a GumboVector handle.
- class gumbo_vector_t is repr('CPointer') { }
- # typedef struct GumboInternalOutput {
- # GumboNode* document;
- #
- # GumboNode* root;
- #
- # GumboVector /* GumboError */ errors;
- # } GumboOutput;
- #
- # NativeCall binding for C's GumboOutput: two node pointers (kept as
- # opaque gumbo_node_t handles) followed by the embedded `errors` vector.
- class gumbo_output_s is repr('CStruct') {
- has gumbo_node_t $.document;
- has gumbo_node_t $.root;
- HAS gumbo_vector_s $.errors;
- }
- # Native binding for libgumbo's gumbo_parse(): takes the HTML text and
- # returns the parse output as an opaque pointer.
- sub gumbo_parse(Str --> gumbo_output_t) is native('libgumbo') { * }
- # Native binding for libgumbo's gumbo_normalized_tagname(): takes a
- # numeric tag value and returns the corresponding tag name string.
- sub gumbo_normalized_tagname(int32 --> str) is native('libgumbo') { * }
- # Debug helper: prints one "<type> : <native size in bytes>" line for
- # each struct binding declared in this module.
- sub gumbo-type-size {
-     for (
-         gumbo_output_s,
-         gumbo_vector_s,
-         gumbo_attribute_s,
-         gumbo_document_s,
-         gumbo_element_s,
-         gumbo_node_s,
-         gumbo_source_position,
-         gumbo_string_piece_s,
-         gumbo_text_s,
-     ) -> $binding {
-         say "{$binding.perl} : {nativesizeof($binding)}";
-     }
- }
- # Parses $html with libgumbo and converts the resulting node tree into
- # XML::Element / XML::Text objects.
- #
- # Diagnostics are still printed with `say`: the input, the native struct
- # sizes, the raw output structs, the root node type and an outline of the
- # tree that was built.
- #
- # Returns the root XML::Element, or an undefined value when the root node
- # reported by Gumbo is not an element.
- sub parse-html (Str $html) is export {
-     say $html;
-     my $xmlroot;
-     gumbo-type-size();
-     my gumbo_output_t $gumbo_output = gumbo_parse($html);
-     say $gumbo_output.perl;
-     my gumbo_output_s $go = nativecast(gumbo_output_s, $gumbo_output);
-     say $go.perl;
-     my gumbo_node_s $groot = nativecast(gumbo_node_s, $go.root);
-     say $groot.type;
-     # The node type is an integer, so compare it numerically instead of
-     # with the string operator `eq`.
-     if $groot.type == GUMBO_NODE_ELEMENT.value {
-         my $element = $groot.v.element;
-         $xmlroot = build-element($element);
-         # children.data points at an array of node pointers.
-         my $tab_child = nativecast(CArray[gumbo_node_t], $element.children.data);
-         for ^$element.children.length -> $i {
-             build-tree(nativecast(gumbo_node_s, $tab_child[$i]), $xmlroot);
-         }
-     }
-     # print_xml calls .nodes on its argument, which fails on an undefined
-     # value, so only print a tree that was actually built.
-     print_xml($xmlroot) if $xmlroot.defined;
-     # NOTE(review): the native output is never released here; no binding
-     # for Gumbo's destroy function exists in this module.
-     return $xmlroot;
- }
- # Converts one Gumbo node into an XML node appended to $parent.
- # Element nodes are converted with build-element and their children are
- # processed recursively; text nodes become XML::Text.  Any other node
- # type is ignored.
- sub build-tree(gumbo_node_s $node, XML::Element $parent is rw) {
-     my $kind = $node.type;
-     if $kind == GUMBO_NODE_ELEMENT.value {
-         my $element = build-element($node.v.element);
-         $parent.append($element);
-         my $children   = $node.v.element.children;
-         # children.data points at an array of node pointers.
-         my $child_ptrs = nativecast(CArray[gumbo_node_t], $children.data);
-         for ^$children.length -> $idx {
-             build-tree(nativecast(gumbo_node_s, $child_ptrs[$idx]), $element);
-         }
-     }
-     elsif $kind == GUMBO_NODE_TEXT.value {
-         $parent.append(XML::Text.new(text => $node.v.text.text));
-     }
- }
- # Builds an XML::Element from a Gumbo element: the name comes from
- # gumbo_normalized_tagname and every Gumbo attribute is copied into
- # .attribs.  Progress is printed with `say` along the way.
- # Returns the new XML::Element (children are not added here).
- sub build-element(gumbo_element_s $elem) {
-     my $xml = XML::Element.new;
-     $xml.name = gumbo_normalized_tagname($elem.tag);
-     my $attributes = $elem.attributes;
-     say $attributes.^name;
-     say $attributes.defined;
-     return $xml unless $attributes.defined;
-     say "{$xml.name}- attr number : {$attributes.length}";
-     # attributes.data points at an array of attribute pointers.
-     my $attr_ptrs = nativecast(CArray[gumbo_attribute_t], $attributes.data);
-     for ^$attributes.length -> $idx {
-         my $attr = nativecast(gumbo_attribute_s, $attr_ptrs[$idx]);
-         say $attr.attr_namespace;
-         say $attr.value;
-         say $attr.name;
-         $xml.attribs{$attr.name} = $attr.value;
-     }
-     return $xml;
- }
- # Prints an outline of an XML tree: one opening line per element (name
- # followed by its comma-joined attribute keys) and one closing line,
- # each prefixed with "--" repeated $cpt times.  Comment and text nodes
- # print nothing; $cpt is the current depth.
- sub print_xml ($xmldoc, $cpt = 0){
-     return if $xmldoc ~~ any(XML::Comment, XML::Text);
-     my $is_element = $xmldoc ~~ XML::Element;
-     my $indent     = "--" x $cpt;
-     say $indent, "<" , $xmldoc.name, $xmldoc.attribs.keys.join(',') if $is_element;
-     return unless $xmldoc.nodes;
-     print_xml($_, $cpt + 1) for $xmldoc.nodes;
-     say $indent, "</" , $xmldoc.name if $is_element;
- }
- }
- root@testperl6:~/piko# perl6 gumbo.pl
- <html><head class='piko'><title>piko</title></html>
- gumbo_output_s : 20
- gumbo_vector_s : 12
- gumbo_attribute_s : 76
- gumbo_document_s : 32
- gumbo_element_s : 72
- gumbo_node_s : 88
- gumbo_source_position : 12
- gumbo_string_piece_s : 8
- gumbo_text_s : 24
- gumbo_output_t.new
- gumbo_output_s.new(document => gumbo_node_t.new, root => gumbo_node_t.new, errors => gumbo_vector_s.new(data => Pointer.new(263536920), length => 1, capacity => 5))
- 1
- gumbo_vector_s
- True
- html- attr number : 0
- gumbo_vector_s
- True
- head- attr number : 1
- 0
- String corruption detected: bad storage type
- in sub build-element at /root/piko/Gumbo.pm6:270
- in block at /root/piko/Gumbo.pm6:244
- in sub build-tree at /root/piko/Gumbo.pm6:242
- in sub parse-html at /root/piko/Gumbo.pm6:232
- in block <unit> at gumbo.pl:4
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement