Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using System;
- using System.Collections.Generic;
- using System.Diagnostics;
- using System.Globalization;
- using System.IO;
- using System.Linq;
- using ExCSS;
- using HtmlAgilityPack;
- using Newtonsoft.Json;
namespace QADirector
{
    /// <summary>
    /// Converts a QA Director HTML export into a JSON file.
    /// Bold SPAN elements are treated as property names, the SPAN that follows a name is its value,
    /// and the CSS <c>left</c> offset of a name decides which object of the hierarchy receives it.
    /// </summary>
    internal class HtmlToJsonConverter
    {
        private readonly FileInfo _inputFile;
        private readonly FileInfo _outputFile;

        /// <summary>
        /// Creates an object that can convert a QA Director HTML export to a JSON object.
        /// </summary>
        /// <param name="source">HTML report to be converted.</param>
        /// <param name="destination">JSON file name to be created</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="source"/> or <paramref name="destination"/> is <c>null</c>.
        /// </exception>
        public HtmlToJsonConverter(FileInfo source, FileInfo destination)
        {
            // Fail at construction time instead of with a NullReferenceException inside Convert().
            if (source == null)
            {
                throw new ArgumentNullException("source");
            }

            if (destination == null)
            {
                throw new ArgumentNullException("destination");
            }

            _inputFile = source;
            _outputFile = destination;
        }

        /// <summary>
        /// Converts the HTML input file into JSON and writes the output file
        /// </summary>
        public void Convert()
        {
            var htmlDocument = ReadFileToHtmlDocument();

            // GetPages returns a lazy query; materialize it once so that counting the pages
            // and parsing them do not each walk the whole document again.
            var pages = GetPages(htmlDocument).ToList();
            Debug.WriteLine("Found {0} pages", pages.Count);

            var objects = ParseHtmlToObjects(pages);
            WriteJson(objects);
        }

        /// <summary>
        /// Write the object as JSON using a custom converter that
        /// correctly writes the property list as single properties
        /// and not as a list.
        /// </summary>
        /// <param name="obj">Object to be serialized</param>
        private void WriteJson(DataObject obj)
        {
            var converter = new CustomJsonConverter();
            var data = JsonConvert.SerializeObject(obj, Formatting.Indented, converter);
            File.WriteAllText(_outputFile.FullName, data);
        }

        /// <summary>
        /// Parses the HTML pages and creates an equivalent object by parsing the properties and values
        /// as well as the properties and values of the sub items
        /// </summary>
        /// <param name="pages">HTML pages to be analyzed. <see cref="GetPages"/>.</param>
        /// <returns>An object containing properties and child objects.</returns>
        private static DataObject ParseHtmlToObjects(IEnumerable<HtmlNode> pages)
        {
            var rootObject = new DataObject();
            var currentObject = rootObject;

            foreach (var page in pages)
            {
                var spans = page.Descendants().Where(x => x.Name == "span").ToList();

                // Name of the property whose value is expected in the next non-key span.
                string pendingKey = null;

                foreach (var span in spans)
                {
                    // Parse the style once: GetKeyFromNode returns null for non-key spans.
                    var detectedKey = GetKeyFromNode(span);
                    if (detectedKey == null)
                    {
                        // A value span: store it under the key seen before. AddProperty ignores
                        // the call when no key is pending (e.g. the page information span).
                        currentObject = AddProperty(currentObject, pendingKey, GetTextFromSpan(span));
                        pendingKey = null;
                        continue;
                    }

                    // A new key while the previous one never received a value:
                    // keys without value are stored with an empty string.
                    currentObject = AddProperty(currentObject, pendingKey, "");

                    pendingKey = detectedKey;
                    currentObject = MoveToLevel(currentObject, GetIndentationFromNode(span));
                }

                // A key that is the last span of the page would otherwise be lost,
                // because the pending key does not survive into the next page.
                currentObject = AddProperty(currentObject, pendingKey, "");
            }

            return rootObject;
        }

        /// <summary>
        /// Selects the object that properties of the given indentation level are added to.
        /// </summary>
        /// <param name="current">Object that received the previous property</param>
        /// <param name="level">Indentation of the key that was just detected</param>
        /// <returns>
        /// A new child of <paramref name="current"/> if the level is deeper, otherwise the closest
        /// object on the parent chain whose level is not greater than <paramref name="level"/>
        /// (the root at the latest).
        /// </returns>
        private static DataObject MoveToLevel(DataObject current, decimal level)
        {
            if (level > current.level)
            {
                // Descend to lower level: create a new child
                var child = new DataObject {level = level, Parent = current};
                current.Children.Add(child);
                return child;
            }

            // Same level: nothing to do. Smaller level: move up until the level fits,
            // but never above the root, which has no parent.
            while (level < current.level && current.Parent != null)
            {
                current = current.Parent;
            }

            return current;
        }

        /// <summary>
        /// Adds a property to an object.
        /// If that property already exists, create a new object and add the property there
        /// </summary>
        /// <param name="obj">Object to add the property to</param>
        /// <param name="key">Name of the property; <c>null</c> means there is nothing to add</param>
        /// <param name="value">Value of the property</param>
        /// <returns>The same object if the property did not exist yet, the new object if the property already existed</returns>
        /// <exception cref="InvalidOperationException">
        /// The property already exists on an object without parent, so no sibling can be created.
        /// </exception>
        private static DataObject AddProperty(DataObject obj, string key, string value)
        {
            // Special case: <Span> which contains the page information. Skip it.
            if (key == null)
            {
                return obj;
            }

            if (obj.Properties.ContainsKey(key))
            {
                if (obj.Parent == null)
                {
                    // Previously this surfaced as a NullReferenceException on obj.Parent.
                    throw new InvalidOperationException(String.Format(
                        "Property '{0}' occurs twice on the root object; a sibling object cannot be created for it.",
                        key));
                }

                // This key was already assigned, so this must be a new object on the same level
                var sibling = new DataObject {level = obj.level, Parent = obj.Parent};
                obj.Parent.Children.Add(sibling);
                obj = sibling;
            }

            obj.Properties.Add(key, value);
            return obj;
        }

        /// <summary>
        /// Gets the indentation from the HTML node.
        /// Detection is done via the left CSS style of the HTML element.
        /// </summary>
        /// <param name="step">HTML node to analyze</param>
        /// <returns>Indentation value, 0 if no left attribute was found</returns>
        private static decimal GetIndentationFromNode(HtmlNode step)
        {
            decimal indentation = 0;

            var rule = ExtractStyle(step).StyleRules.FirstOrDefault();
            if (rule == null)
            {
                // Nothing could be parsed from the style attribute.
                return indentation;
            }

            foreach (var declaration in rule.Declarations)
            {
                if (declaration.Name != "left")
                {
                    continue;
                }

                // The value is written with an "in" unit suffix and a '.' decimal separator,
                // so it is parsed independently of the current culture.
                indentation = decimal.Parse(
                    declaration.Term.ToString().Replace("in", ""),
                    CultureInfo.InvariantCulture);
                Debug.WriteLine("Left attribute found:" + declaration.Term);
            }

            return indentation;
        }

        /// <summary>
        /// Check if the node contains a key and if so, return its text.
        /// </summary>
        /// <param name="span">HTML SPAN node to be analyzed</param>
        /// <returns>Text of the key if key was detected, <c>null</c> otherwise.
        /// Text is cleaned from HTML entities and has leading and trailing colons removed.</returns>
        private static string GetKeyFromNode(HtmlNode span)
        {
            return IsKey(span) ? GetTextFromSpan(span).Trim(':') : null;
        }

        /// <summary>
        /// Check if the node contains a key.
        /// Detection is done via the <c>font-weight: bold</c> CSS style of the HTML element.
        /// </summary>
        /// <param name="span">HTML SPAN node to be analyzed</param>
        /// <returns><c>True</c> if font-weight was bold, <c>false</c> otherwise.</returns>
        private static bool IsKey(HtmlNode span)
        {
            var rule = ExtractStyle(span).StyleRules.FirstOrDefault();
            if (rule == null)
            {
                return false;
            }

            foreach (var declaration in rule.Declarations)
            {
                if (declaration.Name == "font-weight" && declaration.Term.ToString() == "bold")
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Gets the text from a HTML SPAN element.
        /// The text is taken from the first <c>NOBR</c> descendant (or from the SPAN itself
        /// if there is none) and HTML special characters are decoded.
        /// </summary>
        /// <param name="span">SPAN element to get the text from.</param>
        /// <returns>Decoded inner text</returns>
        private static string GetTextFromSpan(HtmlNode span)
        {
            // First() threw InvalidOperationException for a span without <nobr>.
            var nobr = span.Descendants("nobr").FirstOrDefault();
            var text = (nobr ?? span).InnerText;
            return System.Net.WebUtility.HtmlDecode(text);
        }

        /// <summary>
        /// Extracts the information of the <c>style</c> attribute
        /// </summary>
        /// <param name="element">HTML element to extract the style information from</param>
        /// <returns>
        /// Style sheet obtained by parsing the attribute value wrapped in a <c>.dummy</c> rule.
        /// A missing attribute is treated like an empty one.
        /// </returns>
        private static StyleSheet ExtractStyle(HtmlNode element)
        {
            // The attribute indexer yields null for a missing attribute; ask for a default instead.
            var rawStyle = element.GetAttributeValue("style", string.Empty);

            // The parser expects a complete rule, so the inline declarations get a dummy selector.
            return new Parser().Parse(String.Format(".dummy{{{0}}}", rawStyle));
        }

        /// <summary>
        /// Gets the pages: every DIV element that has exactly one DIV ancestor.
        /// </summary>
        /// <param name="htmlDocument">HTML to analyze</param>
        /// <returns>Lazily evaluated sequence of the matching DIV nodes</returns>
        private static IEnumerable<HtmlNode> GetPages(HtmlDocument htmlDocument)
        {
            return htmlDocument.DocumentNode.Descendants()
                .Where(x => x.Name == "div" && x.Ancestors("div").Count() == 1);
        }

        /// <summary>
        /// Reads the file contents from disk and converts it into a HTML document
        /// </summary>
        /// <returns>HTML document as read from disk</returns>
        private HtmlDocument ReadFileToHtmlDocument()
        {
            var source = File.ReadAllText(_inputFile.FullName);
            var html = new HtmlDocument();
            html.LoadHtml(source);
            return html;
        }
    }
}
// Walks the declarations of the first style rule and returns true as soon as a declaration
// named "font-weight" with the term "bold" is found; returns false when none matches.
// NOTE(review): statement fragment only - `styleSheet` is declared outside these lines.
- foreach (var cssAttribute in styleSheet.StyleRules[0].Declarations)
- {
- if (cssAttribute.Name != "font-weight" || cssAttribute.Term.ToString() != "bold") continue;
- return true;
- }
- return false;
// Same scan written with a positive condition: return true for the first declaration named
// "font-weight" whose term is "bold", false when the loop finishes without a match.
// NOTE(review): statement fragment only - `styleSheet` is declared outside these lines.
- foreach (var cssAttribute in styleSheet.StyleRules[0].Declarations)
- {
- if (cssAttribute.Name == "font-weight" && cssAttribute.Term.ToString() == "bold")
- {
- return true;
- }
- }
- return false;
/// <summary>
/// Check if the node contains a key.
/// Detection is done via a <c>font-weight</c> declaration with the term <c>bold</c>
/// in the first rule of the element's parsed style.
/// </summary>
/// <param name="span">HTML SPAN node to be analyzed</param>
/// <returns><c>True</c> if such a declaration exists, <c>false</c> otherwise.</returns>
private static bool IsKey(HtmlNode span)
{
    var styleSheet = ExtractStyle(span);

    // Any() needs a predicate; the lambda parameter was missing, which left
    // cssAttribute undeclared and the method uncompilable.
    return styleSheet.StyleRules[0].Declarations
        .Any(cssAttribute => cssAttribute.Name == "font-weight" && cssAttribute.Term.ToString() == "bold");
}
/// <summary>
/// Builds the object tree for a sequence of HTML pages.
/// The SPAN descendants of each page are handed to <c>AnalyzeSpanTags</c>, which continues
/// from the object the previous page ended on.
/// </summary>
/// <param name="pages">HTML pages to be analyzed.</param>
/// <returns>The root of the object tree.</returns>
private static DataObject ParseHtmlToObjects(IEnumerable<HtmlNode> pages)
{
    var root = new DataObject();
    var cursor = root;

    foreach (var page in pages)
    {
        // Collect the SPAN elements of this page in document order.
        var spans = (from node in page.Descendants()
                     where node.Name == "span"
                     select node).ToList();

        cursor = AnalyzeSpanTags(spans, cursor);
    }

    return root;
}
/// <summary>
/// Turns a sequence of SPAN elements into properties of the object tree.
/// A key span selects the target object by its indentation; the following non-key span
/// supplies the value. Keys that never receive a value are stored with an empty string.
/// </summary>
/// <param name="steps">SPAN elements in document order</param>
/// <param name="currentObject">Object the analysis starts from</param>
/// <returns>The object that received the last property</returns>
private static DataObject AnalyzeSpanTags(IEnumerable<HtmlNode> steps, DataObject currentObject)
{
    // Name of the property whose value is expected in the next non-key span.
    string key = null;

    foreach (var step in steps)
    {
        if (!IsKey(step))
        {
            // If this is not a key, the key was detected before. Use it to populate the object
            currentObject = AddProperty(currentObject, key, GetTextFromSpan(step));
            key = null;
            continue;
        }

        // Special case: Maybe we detected a new key, although the old key has not been used as property yet
        // This can happen for keys without value, so add it empty.
        currentObject = AddProperty(currentObject, key, "");

        key = GetKeyFromNode(step);
        var level = GetIndentationFromNode(step);
        if (level > currentObject.level)
        {
            // Descend to lower level: create a new child
            var child = new DataObject { level = level, Parent = currentObject };
            currentObject.Children.Add(child);
            currentObject = child;
        }
        else
        {
            currentObject = EqualizeLevel(currentObject, level);
        }
    }

    // A key that is the very last span has no successor that would trigger the
    // "key without value" case above, so store it here instead of dropping it.
    // AddProperty ignores the call when no key is pending.
    return AddProperty(currentObject, key, "");
}
/// <summary>
/// Moves up the parent chain until the object's level is no longer greater than the requested level.
/// </summary>
/// <param name="obj">Object to start from</param>
/// <param name="level">Requested indentation level</param>
/// <returns>
/// <paramref name="obj"/> itself if its level already fits, otherwise the closest ancestor whose
/// level is not greater than <paramref name="level"/>; the topmost object if no ancestor fits.
/// </returns>
private static DataObject EqualizeLevel(DataObject obj, decimal level)
{
    // Stop at the object without parent: stepping further would make obj null
    // and the next level comparison would throw a NullReferenceException.
    while (level < obj.level && obj.Parent != null)
    {
        obj = obj.Parent;
    }

    return obj;
}
- // Go through all pages
- foreach (var page in pages)
/// <summary>
/// Provides the object a property with the given key can be added to.
/// If the object already has that property, a new sibling on the same level is created,
/// registered with the parent and returned.
/// </summary>
/// <param name="obj">Object that is about to receive the property</param>
/// <param name="key">Name of the property; <c>null</c> means there is nothing to add</param>
/// <returns>
/// <paramref name="obj"/> if the key is <c>null</c> or not present yet, the new sibling otherwise.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The key already exists on an object without parent, so no sibling can be created.
/// </exception>
private static DataObject GetAddedSiblingIfKeyExists(DataObject obj, string key)
{
    if (key == null || !obj.Properties.ContainsKey(key))
    {
        return obj;
    }

    if (obj.Parent == null)
    {
        // Previously this surfaced as a NullReferenceException on obj.Parent.
        throw new InvalidOperationException(String.Format(
            "Property '{0}' occurs twice on the root object; a sibling object cannot be created for it.",
            key));
    }

    var sibling = new DataObject { level = obj.level, Parent = obj.Parent };
    obj.Parent.Children.Add(sibling);
    return sibling;
}
/// <summary>
/// Stores a key/value pair in the property collection of an object.
/// Nothing is stored when the key is <c>null</c>.
/// </summary>
/// <param name="obj">Object that receives the property</param>
/// <param name="key">Name of the property, or <c>null</c> to skip</param>
/// <param name="value">Value of the property</param>
/// <returns>Always <paramref name="obj"/></returns>
private static DataObject AddProperty(DataObject obj, string key, string value)
{
    // A null key marks a span that is not preceded by a key
    // (the page information span); there is nothing to store for it.
    if (key != null)
    {
        obj.Properties.Add(key, value);
    }

    return obj;
}
- private static DataObject AnalyzeSpanTags(IEnumerable<HtmlNode> steps, DataObject currentObject)
- {
- string key = null;
- foreach (var step in steps)
- {
- currentObject = GetAddedSiblingIfKeyExists(currentObject, key);
- if (!IsKey(step))
- {
- .....
- /// <summary>
- /// Converts the HTML input file into JSON and writes the output file
- /// </summary>
- public void Convert()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement