SHOW:
|
|
- or go back to the newest paste.
1 | using System; | |
2 | using System.Collections.Generic; | |
3 | using System.Linq; | |
4 | using System.Text; | |
5 | using System.Net; | |
6 | using System.IO; | |
7 | using HtmlAgilityPack; | |
8 | using System.Xml.Serialization; | |
9 | using System.Runtime.Serialization.Formatters.Binary; | |
10 | using System.Runtime.Serialization; | |
11 | ||
12 | namespace habrJob | |
13 | { | |
/// <summary>
/// Education level required by a job posting.
/// Values are produced from the site's Russian labels by <see cref="HabraJob.ParseEducation"/>.
/// </summary>
public enum Education
{
    /// <summary>Education does not matter (label "Не имеет значения").</summary>
    None,
    /// <summary>Higher education (label "Высшее").</summary>
    Higher,
    /// <summary>Incomplete higher education (label "Неполное высшее").</summary>
    IncompleteHigher,
    /// <summary>Secondary vocational education (label "Среднее специальное").</summary>
    SecondaryVocational,
    /// <summary>Secondary education (label "Среднее").</summary>
    Secondary,
    /// <summary>Still studying (label "Учащийся").</summary>
    Pupil
}

/// <summary>
/// Employment type of a job posting.
/// Values are produced from the site's Russian labels by <see cref="HabraJob.ParseEmployment"/>.
/// </summary>
public enum Employment
{
    /// <summary>Full-time (label "полная").</summary>
    Full,
    /// <summary>Part-time (label "частичная").</summary>
    Partial,
    /// <summary>Freelance (label "фриланс").</summary>
    Freelance
}

/// <summary>
/// One job posting: the fields read from the listing row plus the details
/// filled in later from the posting's own page. The type is XML-serialized,
/// so it keeps a public parameterless constructor and public read/write properties.
/// </summary>
public class HabraJob
{
    /// <summary>Exact price text the listing shows when the salary is negotiable.</summary>
    private const string NegotiablePrice = "з/п договорная";

    /// <summary>
    /// Number of leading characters skipped before the amount.
    /// NOTE(review): presumably a short textual prefix in front of the number - confirm against the page.
    /// </summary>
    private const int PricePrefixLength = 3;

    private string _price = string.Empty;

    /// <summary>Creates an empty posting (required by the XML serializer).</summary>
    public HabraJob()
    {
    }

    /// <summary>Address of the posting's own page.</summary>
    public string Url { get; set; }

    /// <summary>Posting title as shown in the listing.</summary>
    public string Title { get; set; }

    /// <summary>
    /// Raw price text from the listing. Setting it also refreshes <see cref="PriceMoney"/>:
    /// the numeric amount when the text can be parsed, otherwise 0 (negotiable salary,
    /// null, too-short or non-numeric text). The setter never throws.
    /// </summary>
    public string Price
    {
        get
        {
            return _price;
        }
        set
        {
            _price = value;
            // Always recompute so a previously parsed amount cannot survive next to a
            // negotiable or unparsable price text.
            PriceMoney = ParsePriceMoney(value);
        }
    }

    /// <summary>Numeric amount extracted from <see cref="Price"/>; 0 when there is none.</summary>
    public int PriceMoney { get; set; }

    /// <summary>Company name taken from the posting page.</summary>
    public string Company { get; set; }

    /// <summary>Country link text from the listing's location cell.</summary>
    public string Country { get; set; }

    /// <summary>Region link text from the listing's location cell.</summary>
    public string Region { get; set; }

    /// <summary>City link text from the listing's location cell.</summary>
    public string City { get; set; }

    /// <summary>Required education level.</summary>
    public Education Education { get; set; }

    /// <summary>Employment type.</summary>
    public Employment Employment { get; set; }

    /// <summary>
    /// Maps the site's Russian education label to an <see cref="Education"/> value.
    /// The match is exact (case- and whitespace-sensitive).
    /// </summary>
    /// <param name="education">Label text exactly as it appears on the page.</param>
    /// <returns>The matching <see cref="Education"/> value.</returns>
    /// <exception cref="ArgumentException">The label is not one of the known values.</exception>
    public static Education ParseEducation(string education)
    {
        switch (education)
        {
            case "Высшее":
                return Education.Higher;
            case "Неполное высшее":
                return Education.IncompleteHigher;
            case "Среднее специальное":
                return Education.SecondaryVocational;
            case "Среднее":
                return Education.Secondary;
            case "Учащийся":
                return Education.Pupil;
            case "Не имеет значения":
                return Education.None;
            default:
                // ArgumentException derives from Exception, so existing catch blocks still work,
                // but the failure now names the offending value.
                throw new ArgumentException(
                    string.Format("Unknown education value: '{0}'.", education), "education");
        }
    }

    /// <summary>
    /// Maps the site's Russian employment label to an <see cref="Employment"/> value.
    /// The match is exact (case- and whitespace-sensitive).
    /// </summary>
    /// <param name="employment">Label text exactly as it appears on the page.</param>
    /// <returns>The matching <see cref="Employment"/> value.</returns>
    /// <exception cref="ArgumentException">The label is not one of the known values.</exception>
    public static Employment ParseEmployment(string employment)
    {
        switch (employment)
        {
            case "полная":
                return Employment.Full;
            case "частичная":
                return Employment.Partial;
            case "фриланс":
                return Employment.Freelance;
            default:
                throw new ArgumentException(
                    string.Format("Unknown employment value: '{0}'.", employment), "employment");
        }
    }

    /// <summary>
    /// Extracts the numeric amount from a price text: skips the leading prefix,
    /// removes the spaces used as thousands separators and parses the rest.
    /// </summary>
    /// <param name="price">Raw price text; may be null.</param>
    /// <returns>The parsed amount, or 0 when the text carries no parsable amount.</returns>
    private static int ParsePriceMoney(string price)
    {
        if (price == null || price == NegotiablePrice || price.Length <= PricePrefixLength)
        {
            return 0;
        }

        // Regular spaces are what the original code stripped; non-breaking spaces are
        // removed as well because HTML text may use them as group separators.
        string digits = price.Substring(PricePrefixLength)
                             .Replace(" ", "")
                             .Replace("\u00A0", "");

        int amount;
        bool parsed = int.TryParse(
            digits,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out amount);

        return parsed ? amount : 0;
    }
}
106 | ||
/// <summary>
/// Console entry point: walks the paginated job listing, collects one
/// <see cref="HabraJob"/> per listing row, loads every posting page for the
/// remaining details and writes the whole list to report.xml.
/// </summary>
class Program
{
    /// <summary>Client used by <see cref="Main"/> to download the listing pages.</summary>
    public static WebClient wClient;

    /// <summary>Most recent request created by <see cref="GetHtmlString"/>.</summary>
    public static WebRequest request;

    /// <summary>
    /// Most recent response obtained by <see cref="GetHtmlString"/>.
    /// It is already disposed by the time that method returns.
    /// </summary>
    public static WebResponse response;

    /// <summary>Postings collected so far; filled by <see cref="GetJobLinks"/>.</summary>
    public static List<HabraJob> jobList;

    /// <summary>Encoding used to decode every downloaded page.</summary>
    public static Encoding encode = System.Text.Encoding.GetEncoding("utf-8");

    /// <summary>
    /// Reads the number of listing pages from the pager element ("nav-pages").
    /// The number is taken from the end of the href of the link inside the last
    /// "li" item. NOTE(review): this assumes the last pager item links to the
    /// last page - confirm against the page markup.
    /// </summary>
    /// <param name="html">A parsed listing page.</param>
    /// <returns>
    /// The page number found in the link, or 1 when the page has no pager or
    /// the link carries no number.
    /// </returns>
    static int GetPagesCount(HtmlDocument html)
    {
        HtmlNode pager = html.GetElementbyId("nav-pages");
        if (pager == null)
        {
            // No pager at all: treat the listing as a single page.
            return 1;
        }

        HtmlNode lastItem = pager.ChildNodes.LastOrDefault(x => x.Name == "li");
        HtmlNode link = lastItem == null ? null : lastItem.FirstChild;
        HtmlAttribute href = link == null ? null : link.Attributes["href"];
        if (href == null || string.IsNullOrEmpty(href.Value))
        {
            return 1;
        }

        // The original read exactly one character (second from the end), which
        // silently truncated any page count of 10 or more.
        return ParseTrailingNumber(href.Value, 1);
    }

    /// <summary>
    /// Returns the last run of ASCII digits in <paramref name="text"/> as a number,
    /// ignoring any non-digit characters that follow it (for example a closing slash).
    /// </summary>
    /// <param name="text">Text to scan; must not be null.</param>
    /// <param name="fallback">Value returned when no number can be read.</param>
    static int ParseTrailingNumber(string text, int fallback)
    {
        int end = text.Length;
        while (end > 0 && !IsAsciiDigit(text[end - 1]))
        {
            end--;
        }

        int start = end;
        while (start > 0 && IsAsciiDigit(text[start - 1]))
        {
            start--;
        }

        if (start == end)
        {
            return fallback;
        }

        int number;
        bool parsed = int.TryParse(
            text.Substring(start, end - start),
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out number);

        return parsed ? number : fallback;
    }

    /// <summary>True for the characters '0' through '9' only.</summary>
    static bool IsAsciiDigit(char c)
    {
        return c >= '0' && c <= '9';
    }

    /// <summary>
    /// Adds one <see cref="HabraJob"/> to <see cref="jobList"/> for every row of the
    /// "job-items" table that has at least three cells: cell 0 holds the link and
    /// title, cell 1 the price, cell 2 the location links.
    /// </summary>
    /// <param name="html">A parsed listing page.</param>
    /// <exception cref="InvalidOperationException">The page has no "job-items" element.</exception>
    static void GetJobLinks(HtmlDocument html)
    {
        HtmlNode jobItems = html.GetElementbyId("job-items");
        if (jobItems == null)
        {
            throw new InvalidOperationException("Element 'job-items' was not found on the listing page.");
        }

        foreach (HtmlNode row in jobItems.ChildNodes.Where(x => x.Name == "tr"))
        {
            HtmlNode[] cells = row.ChildNodes.Where(x => x.Name == "td").ToArray();

            // Rows without data cells are skipped; three cells are needed because
            // cells[2] is read below.
            if (cells.Length < 3)
            {
                continue;
            }

            HtmlNode[] location = cells[2].ChildNodes.Where(x => x.Name == "a").ToArray();

            jobList.Add(new HabraJob()
            {
                Url = cells[0].ChildNodes.First().Attributes["href"].Value,
                Title = cells[0].FirstChild.InnerText,
                Price = cells[1].FirstChild.InnerText,
                Country = LinkTextAt(location, 0),
                // The original read index 2 for both Region and City, so the region
                // was always overwritten with the city name.
                Region = LinkTextAt(location, 1),
                City = LinkTextAt(location, 2)
            });
        }
    }

    /// <summary>
    /// Returns the text of the link at <paramref name="index"/>, or null when the
    /// location cell has fewer links than that.
    /// </summary>
    static string LinkTextAt(HtmlNode[] links, int index)
    {
        return index < links.Length ? links[index].InnerText : null;
    }

    /// <summary>
    /// Downloads the posting page at <c>job.Url</c> and fills in the company,
    /// education and employment fields from the details table.
    /// </summary>
    /// <param name="job">Posting to complete; its Url must already be set.</param>
    static void GetFullInfo(HabraJob job)
    {
        HtmlDocument html = new HtmlDocument();
        html.LoadHtml(GetHtmlString(job.Url));

        // Known weakness (flagged by the original author as "must not be done this way"):
        // the details table is reached through hard-coded child positions, so any
        // change in the page markup breaks this lookup.
        var rows = html.GetElementbyId("main-content")
            .ChildNodes[1].ChildNodes[9].ChildNodes[1].ChildNodes[2].ChildNodes[1].ChildNodes[3]
            .ChildNodes.Where(x => x.Name == "tr").ToArray();

        foreach (var tr in rows)
        {
            HtmlNode header = tr.ChildNodes.FindFirst("th");
            HtmlNode cell = tr.ChildNodes.FindFirst("td");

            // FindFirst returns null when the row has no such element.
            if (header == null || cell == null)
            {
                continue;
            }

            // NOTE(review): the company label is matched without a trailing colon while
            // the other two labels include one - confirm this matches the page.
            switch (header.InnerText)
            {
                case "Компания":
                    job.Company = (cell.FirstChild ?? cell).InnerText;
                    break;
                case "Образование:":
                    job.Education = HabraJob.ParseEducation(cell.InnerText);
                    break;
                case "Занятость:":
                    job.Employment = HabraJob.ParseEmployment(cell.InnerText);
                    break;
                default:
                    continue;
            }
        }
    }

    /// <summary>
    /// Downloads <paramref name="url"/> without a proxy and returns the response
    /// body decoded with <see cref="encode"/>.
    /// </summary>
    /// <param name="url">Absolute address to download.</param>
    /// <returns>The full response body as text.</returns>
    public static string GetHtmlString(string url)
    {
        request = WebRequest.Create(url);
        request.Proxy = null;

        // Dispose the response so the underlying connection is released; the
        // original leaked one response per downloaded page.
        using (WebResponse current = request.GetResponse())
        {
            // The static field is still assigned for code that inspects it.
            response = current;

            using (StreamReader reader = new StreamReader(current.GetResponseStream(), encode))
            {
                return reader.ReadToEnd();
            }
        }
    }

    /// <summary>
    /// Writes the given postings to "report.xml" in the current directory,
    /// replacing any existing file.
    /// </summary>
    /// <param name="jobList">Postings to serialize.</param>
    public static void SerializeToXml(List<HabraJob> jobList)
    {
        using (TextWriter output = new StreamWriter("report.xml"))
        {
            XmlSerializer serializer = new XmlSerializer(typeof(List<HabraJob>));
            serializer.Serialize(output, jobList);
        }
    }

    /// <summary>
    /// Collects the postings from every listing page, completes each one from
    /// its own page and saves the result.
    /// </summary>
    static void Main(string[] args)
    {
        jobList = new List<HabraJob>();
        wClient = new WebClient();

        try
        {
            wClient.Proxy = null;
            wClient.Encoding = encode;

            HtmlDocument html = new HtmlDocument();

            // NOTE(review): the first page and the following pages use different
            // host names - confirm both serve the same listing.
            html.LoadHtml(wClient.DownloadString("http://habr.ru/job"));
            GetJobLinks(html);

            int pagesCount = GetPagesCount(html);

            // Page 1 was handled above; the remaining pages start at 2.
            for (int i = 2; i <= pagesCount; i++)
            {
                html.LoadHtml(wClient.DownloadString(string.Format("http://habrahabr.ru/job/page{0}", i)));
                GetJobLinks(html);
            }
        }
        finally
        {
            wClient.Dispose();
        }

        foreach (var job in jobList)
        {
            GetFullInfo(job);
        }

        SerializeToXml(jobList);
    }
}
232 | } |