Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // import necessary crates
- extern crate html5ever;
- extern crate reqwest;
- use std::default::Default;
- // import necessary modules from html5ever
- use html5ever::tendril::*;
- use html5ever::tokenizer::BufferQueue;
- use html5ever::tokenizer::{TagToken, StartTag, EndTag};
- use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,};
- use html5ever::tokenizer::CharacterTokens;
/// Tokenizer sink state used to pick a price out of a product page.
///
/// The three flags record which of the relevant elements the tokenizer is
/// currently inside; `price` accumulates the text collected so far.
/// `Default` yields the initial state: all flags `false`, empty `price`.
#[derive(Debug, Default)]
struct TokenPrinter {
    /// Set when a `<p>` start tag carrying the `price` class has been seen
    /// and its end tag has not.
    in_price_tag: bool,
    /// Set while inside a `<span>` that was opened within the price paragraph.
    in_span_tag: bool,
    /// Set while inside a `<bdi>` that was opened within the price paragraph.
    in_bdi_tag: bool,
    /// Price text assembled so far; cleared after each price is printed.
    price: String,
}
- // implement the TokenSink trait for TokenPrinter
- impl TokenSink for TokenPrinter {
- type Handle = ();
- // define function to process each token in the HTML document
- fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
- match token {
- TagToken(tag) => {
- // if the token is a start tag...
- if tag.kind == StartTag {
- // ...and the tag is a <p> tag with class "price"...
- if tag.name.to_string() == "p" {
- for attr in tag.attrs {
- if attr.name.local.to_string() == "class" && attr.value.to_string() == "price" {
- // ...set the in_price_tag flag to true
- self.in_price_tag = true;
- }
- }
- // if we're inside a <p class="price"> tag and the tag is a <span> tag...
- } else if self.in_price_tag && tag.name.to_string() == "span" {
- // ...set the in_span_tag flag to true
- self.in_span_tag = true;
- // if we're inside a <p class="price"> tag and the tag is a <bdi> tag...
- } else if self.in_price_tag && tag.name.to_string() == "bdi" {
- // ...set the in_bdi_tag flag to true
- self.in_bdi_tag = true;
- }
- // if the token is an end tag...
- } else if tag.kind == EndTag {
- // ...and the tag is a <p>, <span>, or <bdi> tag...
- if tag.name.to_string() == "p" {
- // ...set the corresponding flag to false
- self.in_price_tag = false;
- } else if tag.name.to_string() == "span" {
- self.in_span_tag = false;
- } else if tag.name.to_string() == "bdi" {
- self.in_bdi_tag = false;
- }
- }
- },
- // if the token is a character token (i.e., text)...
- CharacterTokens(s) => {
- // ...and we're inside a <p class="price"> tag...
- if self.in_price_tag {
- // ...and we're inside a <span> tag...
- if self.in_span_tag {
- // ...add the text to the price string
- self.price = format!("price: {}", s);
- // ...and we're inside a <bdi> tag...
- } else if self.in_bdi_tag {
- // ...append the text to the price string and print it
- self.price = format!("{}{}", self.price, s);
- println!("{}", self.price);
- // clear the price string for the next price
- self.price.clear();
- }
- }
- },
- // ignore all other tokens
- _ => {},
- }
- // continue processing tokens
- TokenSinkResult::Continue
- }
- }
- #[tokio::main]
- async fn main() -> Result<(), Box<dyn std::error::Error>> {
- // initialize the TokenPrinter
- let sink = TokenPrinter { in_price_tag: false, in_span_tag: false, in_bdi_tag: false, price: String::new() };
- // retrieve HTML content from target website
- //... let resp = reqwest::get("https://www.scrapingcourse.com/ecommerce/product/adrienne-trek-jacket/").await?.text().await?;
- // convert the HTML content to a ByteTendril
- let chunk = ByteTendril::from(resp.as_bytes());
- let mut input = BufferQueue::new();
- input.push_back(chunk.try_reinterpret::<fmt::UTF8>().unwrap());
- // initialize the Tokenizer with the TokenPrinter
- let mut tok = Tokenizer::new(
- sink,
- TokenizerOpts::default(),
- );
- // feed the HTML content to the Tokenizer
- let _ = tok.feed(&mut input);
- assert!(input.is_empty());
- // end tokenization
- tok.end();
- Ok(())
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement