Advertisement
Guest User

Untitled

a guest
Mar 5th, 2015
203
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.37 KB | None | 0 0
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Diagnostics;
  4. using System.Globalization;
  5. using System.IO;
  6. using System.Linq;
  7. using ExCSS;
  8. using HtmlAgilityPack;
  9. using Newtonsoft.Json;
  10.  
  11. namespace QADirector
  12. {
  /// <summary>
  /// Converts a QA Director HTML export into a JSON file.
  /// Bold SPAN elements are treated as property names, the following non-bold SPAN as the value,
  /// and the CSS <c>left</c> offset of a key decides how deeply the property is nested.
  /// </summary>
  internal class HtmlToJsonConverter
  {
      private readonly FileInfo _inputFile;
      private readonly FileInfo _outputFile;

      /// <summary>
      /// Creates an object that can convert a QA Director HTML export to a JSON object.
      /// </summary>
      /// <param name="source">HTML report to be converted.</param>
      /// <param name="destination">JSON file name to be created</param>
      /// <exception cref="ArgumentNullException">
      /// <paramref name="source"/> or <paramref name="destination"/> is <c>null</c>.
      /// </exception>
      public HtmlToJsonConverter(FileInfo source, FileInfo destination)
      {
          // Fail at construction time instead of with a NullReferenceException inside Convert().
          if (source == null) throw new ArgumentNullException("source");
          if (destination == null) throw new ArgumentNullException("destination");

          _inputFile = source;
          _outputFile = destination;
      }

      /// <summary>
      /// Converts the HTML input file into JSON and writes the output file
      /// </summary>
      public void Convert()
      {
          var htmlDocument = ReadFileToHtmlDocument();

          // GetPages returns a lazy query; materialize it once so that counting and
          // parsing do not each walk the whole document.
          var pages = GetPages(htmlDocument).ToList();
          Debug.WriteLine("Found {0} pages", pages.Count);

          var objects = ParseHtmlToObjects(pages);
          WriteJson(objects);
      }

      /// <summary>
      /// Write the object as JSON using a custom converter that
      /// correctly writes the property list as single properties
      /// and not as a list.
      /// </summary>
      /// <param name="obj">Object to be serialized</param>
      private void WriteJson(DataObject obj)
      {
          var converter = new CustomJsonConverter();
          var data = JsonConvert.SerializeObject(obj, Formatting.Indented, converter);
          File.WriteAllText(_outputFile.FullName, data);
      }

      /// <summary>
      /// Parses the HTML pages and creates an equivalent object by parsing the properties and values
      /// as well as the properties and values of the sub items
      /// </summary>
      /// <param name="pages">HTML pages to be analyzed. <see cref="GetPages"/>.</param>
      /// <returns>An object containing properties and child objects.</returns>
      private static DataObject ParseHtmlToObjects(IEnumerable<HtmlNode> pages)
      {
          var rootObject = new DataObject();
          var currentObject = rootObject;

          foreach (var page in pages)
          {
              // Only SPAN elements carry keys and values.
              var steps = page.Descendants().Where(x => x.Name == "span").ToList();

              // The pending key is tracked per page; it is never carried over to the next page.
              string key = null;
              foreach (var step in steps)
              {
                  if (!IsKey(step))
                  {
                      // A value: attach it to the key detected before (AddProperty skips a null key).
                      currentObject = AddProperty(currentObject, key, GetTextFromSpan(step));
                      key = null;
                      continue;
                  }

                  // A new key while the previous one is still pending means the previous key
                  // had no value, so store it empty. AddProperty ignores a null key.
                  currentObject = AddProperty(currentObject, key, "");

                  key = GetKeyFromNode(step);
                  currentObject = MoveToLevel(currentObject, GetIndentationFromNode(step));
              }

              // A key that is the last SPAN of the page has no value either; store it empty
              // instead of silently dropping it.
              currentObject = AddProperty(currentObject, key, "");
          }

          return rootObject;
      }

      /// <summary>
      /// Selects the object that properties on the given indentation level belong to.
      /// </summary>
      /// <param name="current">Object that received the most recent property</param>
      /// <param name="level">Indentation level of the key that was just detected</param>
      /// <returns>
      /// <paramref name="current"/> if the level is unchanged, a newly created child if the level is deeper,
      /// otherwise the closest ancestor on that level (or a new child of the closest shallower ancestor
      /// if no ancestor has exactly that level).
      /// </returns>
      private static DataObject MoveToLevel(DataObject current, decimal level)
      {
          // Move up until the level is no longer deeper than requested; never walk past the root.
          while (level < current.level && current.Parent != null)
          {
              current = current.Parent;
          }

          if (level > current.level)
          {
              // Either a plain descent, or the walk up skipped the requested level because
              // no ancestor sits exactly there. In both cases a new child is needed.
              var child = new DataObject {level = level, Parent = current};
              current.Children.Add(child);
              current = child;
          }

          return current;
      }

      /// <summary>
      /// Adds a property to an object.
      /// If that property already exists, create a new object and add the property there
      /// </summary>
      /// <param name="obj">Object to add the property to</param>
      /// <param name="key">Name of the property; <c>null</c> means there is nothing to add</param>
      /// <param name="value">Value of the property</param>
      /// <returns>The same object if the property did not exist yet, the new object if the property already existed</returns>
      /// <exception cref="InvalidOperationException">
      /// The property already exists on an object without parent, so no sibling can be created.
      /// </exception>
      private static DataObject AddProperty(DataObject obj, string key, string value)
      {
          // Special case: <Span> which contains the page information. Skip it.
          if (key == null) return obj;

          if (obj.Properties.ContainsKey(key))
          {
              if (obj.Parent == null)
              {
                  // The root has no parent to attach a sibling to; report this clearly
                  // instead of failing with a NullReferenceException.
                  throw new InvalidOperationException(
                      String.Format("Property '{0}' occurs more than once on the root object.", key));
              }

              // This key was already assigned, so this must be a new object on the same level
              var sibling = new DataObject {level = obj.level, Parent = obj.Parent};
              obj.Parent.Children.Add(sibling);
              obj = sibling;
          }

          obj.Properties.Add(key, value);
          return obj;
      }

      /// <summary>
      /// Gets the indentation from the HTML node.
      /// Detection is done via the left CSS style of the HTML element.
      /// </summary>
      /// <param name="step">HTML node to analyze</param>
      /// <returns>Indentation value, 0 if no left attribute was found</returns>
      private static decimal GetIndentationFromNode(HtmlNode step)
      {
          decimal thisIndent = 0;

          var rules = ExtractStyle(step).StyleRules;
          if (!rules.Any()) return thisIndent; // style could not be parsed into a rule

          foreach (var cssAttribute in rules[0].Declarations)
          {
              if (cssAttribute.Name != "left") continue;

              // The value is written with a trailing "in" unit and a "." decimal separator,
              // so parse it independently of the current culture.
              var rawValue = cssAttribute.Term.ToString().Replace("in", "");
              thisIndent = decimal.Parse(rawValue, CultureInfo.InvariantCulture);
              Debug.WriteLine("Left attribute found:" + cssAttribute.Term);
          }
          return thisIndent;
      }

      /// <summary>
      /// Check if the node contains a key and if so, return its text.
      /// </summary>
      /// <param name="span">HTML SPAN node to be analyzed</param>
      /// <returns>Text of the key if key was detected, <c>null</c> otherwise.
      /// Text is cleaned from HTML entities and has leading and trailing colons removed.</returns>
      private static string GetKeyFromNode(HtmlNode span)
      {
          return IsKey(span) ? GetTextFromSpan(span).Trim(':') : null;
      }

      /// <summary>
      /// Check if the node contains a key.
      /// Detection is done via the <c>font-weight: bold</c> CSS style of the HTML element.
      /// </summary>
      /// <param name="span">HTML SPAN node to be analyzed</param>
      /// <returns><c>True</c> if font-weight was bold, <c>false</c> otherwise.</returns>
      private static bool IsKey(HtmlNode span)
      {
          var rules = ExtractStyle(span).StyleRules;
          if (!rules.Any()) return false; // no parsable style, so it cannot be bold

          foreach (var cssAttribute in rules[0].Declarations)
          {
              if (cssAttribute.Name == "font-weight" && cssAttribute.Term.ToString() == "bold")
              {
                  return true;
              }
          }
          return false;
      }

      /// <summary>
      /// Gets the text from a HTML SPAN element.
      /// This includes removal of the <c>NOBR</c> tag and decoding of HTML special characters.
      /// </summary>
      /// <param name="span">SPAN element to get the text from.</param>
      /// <returns>Inner text of the SPAN node</returns>
      private static string GetTextFromSpan(HtmlNode span)
      {
          // The text is expected inside a single <nobr>; fall back to the span itself
          // if the element is missing rather than throwing.
          var nobr = span.Descendants("nobr").FirstOrDefault();
          var text = (nobr ?? span).InnerText;
          return System.Net.WebUtility.HtmlDecode(text);
      }

      /// <summary>
      /// Extracts the information of the <c>style</c> attribute
      /// </summary>
      /// <param name="element">HTML element to extract the style information from</param>
      /// <returns>Style sheet obtained by parsing the inline style wrapped in a dummy rule</returns>
      private static StyleSheet ExtractStyle(HtmlNode element)
      {
          // An element without style attribute is treated as having an empty style.
          var rawStyle = element.GetAttributeValue("style", string.Empty);

          // The parser expects a complete rule, so wrap the inline declarations in a dummy selector.
          return new Parser().Parse(String.Format(".dummy{{{0}}}", rawStyle));
      }

      /// <summary>
      /// Gets the pages, identified by a DIV element defining the width and height
      /// inside another DIV element
      /// (the one with style="page-break-inside:avoid;page-break-after:always;")
      /// </summary>
      /// <param name="htmlDocument">HTML to analyze</param>
      /// <returns>DIV nodes corresponding to pages, using the inner of the described DIVs (the one defining width and height)</returns>
      private static IEnumerable<HtmlNode> GetPages(HtmlDocument htmlDocument)
      {
          // A page is a DIV with exactly one DIV ancestor.
          return htmlDocument.DocumentNode.Descendants()
              .Where(x => x.Name == "div" && x.Ancestors("div").Count() == 1);
      }

      /// <summary>
      /// Reads the file contents from disk and converts it into a HTML document
      /// </summary>
      /// <returns>HTML document as read from disk</returns>
      private HtmlDocument ReadFileToHtmlDocument()
      {
          var source = File.ReadAllText(_inputFile.FullName);
          var html = new HtmlDocument();
          html.LoadHtml(source);
          return html;
      }
  }
  245. }
  246.  
  247. foreach (var cssAttribute in styleSheet.StyleRules[0].Declarations)
  248. {
  249. if (cssAttribute.Name != "font-weight" || cssAttribute.Term.ToString() != "bold") continue;
  250. return true;
  251. }
  252. return false;
  253.  
  254. foreach (var cssAttribute in styleSheet.StyleRules[0].Declarations)
  255. {
  256. if (cssAttribute.Name == "font-weight" && cssAttribute.Term.ToString() == "bold")
  257. {
  258. return true;
  259. }
  260. }
  261. return false;
  262.  
  /// <summary>
  /// Check if the node contains a key.
  /// Detection is done via the <c>font-weight: bold</c> CSS style of the HTML element.
  /// </summary>
  /// <param name="span">HTML SPAN node to be analyzed</param>
  /// <returns><c>True</c> if font-weight was bold, <c>false</c> otherwise.</returns>
  private static bool IsKey(HtmlNode span)
  {
      var styleSheet = ExtractStyle(span);

      // Any() needs a predicate lambda; the bare boolean expression did not compile.
      return styleSheet.StyleRules[0].Declarations
          .Any(cssAttribute => cssAttribute.Name == "font-weight" && cssAttribute.Term.ToString() == "bold");
  }
  271.  
  /// <summary>
  /// Builds the object tree for all pages by feeding the SPAN elements of each page,
  /// in page order, to <see cref="AnalyzeSpanTags"/>.
  /// </summary>
  /// <param name="pages">HTML pages to be analyzed.</param>
  /// <returns>The root object; everything parsed hangs below it.</returns>
  private static DataObject ParseHtmlToObjects(IEnumerable<HtmlNode> pages)
  {
      var root = new DataObject();

      // Fold over the pages: the object reached at the end of one page is the starting point
      // for the next one. Select is lazy, so each page's SPAN list is built right before
      // that page is analyzed.
      pages
          .Select(page => page.Descendants().Where(node => node.Name == "span").ToList())
          .Aggregate(root, (cursor, spans) => AnalyzeSpanTags(spans, cursor));

      return root;
  }
  285.  
  /// <summary>
  /// Walks the SPAN elements of one page and turns key/value pairs into properties,
  /// descending into new child objects or climbing back up as the key indentation changes.
  /// </summary>
  /// <param name="steps">SPAN elements of the page, in document order</param>
  /// <param name="currentObject">Object that properties are added to at the start of the page</param>
  /// <returns>The object that was current after the last SPAN was processed</returns>
  private static DataObject AnalyzeSpanTags(IEnumerable<HtmlNode> steps, DataObject currentObject)
  {
      string pendingKey = null;
      foreach (var span in steps)
      {
          if (IsKey(span))
          {
              // A previous key may still be waiting for a value (keys without value);
              // store it empty. AddProperty does nothing for a null key.
              currentObject = AddProperty(currentObject, pendingKey, "");

              pendingKey = GetKeyFromNode(span);
              var indentation = GetIndentationFromNode(span);

              if (indentation <= currentObject.level)
              {
                  // Same level or shallower: stay, or climb up to the matching ancestor.
                  currentObject = EqualizeLevel(currentObject, indentation);
              }
              else
              {
                  // Deeper level: continue in a freshly created child.
                  var child = new DataObject { level = indentation, Parent = currentObject };
                  currentObject.Children.Add(child);
                  currentObject = child;
              }
          }
          else
          {
              // A value span: it belongs to the key seen before (if any).
              currentObject = AddProperty(currentObject, pendingKey, GetTextFromSpan(span));
              pendingKey = null;
          }
      }
      return currentObject;
  }
  320.  
  /// <summary>
  /// Moves up the parent chain until the object's level is no longer deeper than the requested level.
  /// </summary>
  /// <param name="obj">Object to start from</param>
  /// <param name="level">Indentation level to climb up to</param>
  /// <returns>
  /// The first object on the way up whose level is not greater than <paramref name="level"/>,
  /// or the topmost object if every ancestor is deeper.
  /// </returns>
  private static DataObject EqualizeLevel(DataObject obj, decimal level)
  {
      // Stop at the topmost object: walking past it would dereference a null parent.
      while (level < obj.level && obj.Parent != null)
      {
          obj = obj.Parent;
      }
      return obj;
  }
  329.  
  330. // Go through all pages
  331. foreach (var page in pages)
  332.  
  /// <summary>
  /// Returns the object a property with the given key can be added to.
  /// If the object already has that key, a new sibling on the same level is created,
  /// attached to the same parent and returned instead.
  /// </summary>
  /// <param name="obj">Object that is about to receive the property</param>
  /// <param name="key">Name of the property; <c>null</c> means there is nothing to add</param>
  /// <returns><paramref name="obj"/> if the key is new (or <c>null</c>), otherwise the new sibling</returns>
  /// <exception cref="InvalidOperationException">
  /// The key already exists on an object without parent, so no sibling can be created.
  /// </exception>
  private static DataObject GetAddedSiblingIfKeyExists(DataObject obj, string key)
  {
      if (key == null || !obj.Properties.ContainsKey(key)) { return obj; }

      if (obj.Parent == null)
      {
          // An object without parent cannot get a sibling; report this clearly
          // instead of failing with a NullReferenceException.
          throw new InvalidOperationException(
              String.Format("Property '{0}' occurs more than once on the root object.", key));
      }

      var sibling = new DataObject { level = obj.level, Parent = obj.Parent };
      obj.Parent.Children.Add(sibling);
      return sibling;
  }
  341.  
  /// <summary>
  /// Stores a key/value pair in the object's property collection.
  /// </summary>
  /// <param name="obj">Object to add the property to</param>
  /// <param name="key">Name of the property; <c>null</c> means there is nothing to add</param>
  /// <param name="value">Value of the property</param>
  /// <returns>Always <paramref name="obj"/></returns>
  private static DataObject AddProperty(DataObject obj, string key, string value)
  {
      // A null key comes from a <Span> that only holds page information; nothing to store.
      if (key != null)
      {
          obj.Properties.Add(key, value);
      }

      return obj;
  }
  350.  
  351. private static DataObject AnalyzeSpanTags(IEnumerable<HtmlNode> steps, DataObject currentObject)
  352. {
  353. string key = null;
  354. foreach (var step in steps)
  355. {
  356. currentObject = GetAddedSiblingIfKeyExists(currentObject, key);
  357.  
  358. if (!IsKey(step))
  359. {
  360. .....
  361.  
  362. /// <summary>
  363. /// Converts the HTML input file into JSON and writes the output file
  364. /// </summary>
  365. public void Convert()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement