Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <fstream>
- #include <string>
- using namespace std;
- void remove_header(string &line);
- void remove_footer(string &line);
- bool contains_func(string line);
- int find_pos(string line);
- /*Reads a HTML document and removes all HTML elements and outputs a text file with plain text*/
- int main()
- {
- string line;
- ifstream htmlStream("test_file.html");
- ofstream output("output.txt");
- if (htmlStream.is_open())
- {
- while ( htmlStream.good() )
- {
- getline (htmlStream,line);
- if (contains_func(line)== true)
- {
- remove_header(line);
- }
- if (contains_func(line)== true)
- {
- remove_footer(line);
- }
- if (output.is_open())
- {
- output<<line<<"\n";
- }
- }
- }
- htmlStream.close();
- output.close();
- return 0;
- }
/**
 * Removes an HTML tag that sits at the very beginning of a line, e.g. the
 * "<html>" in "<html>text".
 *
 * A leading "<li>" is replaced by "- " so that list items stay recognisable
 * in the plain-text output; any other leading tag is erased. A line that does
 * not start with '<' (including an empty line or one without any tag) is left
 * untouched. If the leading tag is never closed by '>', everything up to the
 * end of the line is removed.
 *
 * @param line A line taken from an HTML file; modified in place.
 */
void remove_header(std::string &line)
{
    // Only a tag at index 0 counts as a "header" tag.
    if (line.empty() || line[0] != '<')
    {
        return;
    }

    const std::string::size_type close = line.find('>');
    // Length of the tag including its '>'; the whole line if '>' is missing.
    const std::string::size_type tag_len =
        (close == std::string::npos) ? line.size() : close + 1;

    if (line.compare(0, tag_len, "<li>") == 0)
    {
        line.replace(0, tag_len, "- ");
    }
    else
    {
        line.erase(0, tag_len);
    }
}
/**
 * Removes the first HTML tag that starts after the beginning of a line,
 * e.g. the "</html>" in "text</html>".
 *
 * Only the characters from that tag's '<' up to and including its matching
 * '>' are erased; text following the tag is kept. A line whose first '<' is
 * at index 0, or that has no '<' at all, is left untouched. If the tag is
 * never closed by '>', everything from its '<' to the end of the line is
 * removed.
 *
 * @param line A line taken from an HTML file; modified in place.
 */
void remove_footer(std::string &line)
{
    const std::string::size_type open = line.find('<');
    if (open == std::string::npos || open == 0)
    {
        return;
    }

    // Search for '>' from the tag's own '<', so that a '>' earlier in the
    // line is not mistaken for the end of this tag.
    const std::string::size_type close = line.find('>', open);
    if (close == std::string::npos)
    {
        line.erase(open);
    }
    else
    {
        line.erase(open, close - open + 1);
    }
}
/**
 * Finds the position at which the first HTML tag of a line begins.
 *
 * @param line A line taken from an HTML file.
 * @return The index of the first '<' in the line. If the line contains no
 *         '<', the length of the line is returned; that index is still a
 *         valid position for std::string::substr and std::string::erase, so
 *         passing it on to them has no effect instead of throwing.
 */
int find_pos(std::string line)
{
    const std::string::size_type pos = line.find('<');
    if (pos == std::string::npos)
    {
        // Every path must return a value: falling off the end of a non-void
        // function is undefined behaviour.
        return static_cast<int>(line.length());
    }
    return static_cast<int>(pos);
}
/**
 * Determines whether a line contains the start of an HTML tag, i.e. a '<'
 * character. Used to decide whether the remove functions should be applied.
 *
 * @param line A line taken from an HTML file.
 * @return true if the line contains at least one '<' (e.g. "<html>"),
 *         false otherwise, including for an empty line.
 */
bool contains_func(std::string line)
{
    return line.find('<') != std::string::npos;
}
Add Comment
Please, Sign In to add comment