View difference between Paste ID: 27SWYLgY and RbmAD8VM
SHOW: | | - or go back to the newest paste.
1
using System;
2
using System.Collections.Generic;
3
using System.Linq;
4
using System.Text;
5
using System.Net;
6
using System.IO;
7
using HtmlAgilityPack;
8
using System.Xml.Serialization;
9
using System.Runtime.Serialization.Formatters.Binary;
10
using System.Runtime.Serialization;
11
12
namespace habrJob
13
{
14
    /// <summary>
    /// Education requirement attached to a vacancy.
    /// Values are produced by <c>HabraJob.ParseEducation</c> from the site's text labels.
    /// </summary>
    public enum Education
    {
        /// <summary>Education does not matter (label "Не имеет значения"); also the default value.</summary>
        None = 0,

        /// <summary>Label "Высшее".</summary>
        Higher = 1,

        /// <summary>Label "Неполное высшее".</summary>
        IncompleteHigher = 2,

        /// <summary>Label "Среднее специальное".</summary>
        SecondaryVocational = 3,

        /// <summary>Label "Среднее".</summary>
        Secondary = 4,

        /// <summary>Label "Учащийся".</summary>
        Pupil = 5
    }
23
24
    /// <summary>
    /// Employment type attached to a vacancy.
    /// Values are produced by <c>HabraJob.ParseEmployment</c> from the site's text labels.
    /// </summary>
    public enum Employment
    {
        /// <summary>Label "полная"; also the default value.</summary>
        Full = 0,

        /// <summary>Label "частичная".</summary>
        Partial = 1,

        /// <summary>Label "фриланс".</summary>
        Freelance = 2
    }
30
31
    /// <summary>
    /// One vacancy scraped from the job listing. Plain data holder with public
    /// read/write properties so that it can be written out with <c>XmlSerializer</c>.
    /// </summary>
    public class HabraJob
    {
        // Listing text used when no amount is published ("salary negotiable").
        private const string NegotiablePrice = "з/п договорная";

        private string _price = string.Empty;

        /// <summary>Parameterless constructor (required by <c>XmlSerializer</c>).</summary>
        public HabraJob()
        {
        }

        /// <summary>Address of the vacancy page.</summary>
        public string Url { get; set; }

        /// <summary>Vacancy title as shown in the listing.</summary>
        public string Title { get; set; }

        /// <summary>
        /// Raw price text as shown in the listing. Setting it also refreshes
        /// <see cref="PriceMoney"/>: the first amount found in the text is stored,
        /// or 0 when the text contains no usable amount. When the text is the
        /// "negotiable" marker, <see cref="PriceMoney"/> is left untouched.
        /// </summary>
        public string Price
        {
            get
            {
                return _price;
            }
            set
            {
                _price = value;
                if (_price != NegotiablePrice)
                {
                    PriceMoney = ExtractAmount(value);
                }
            }
        }

        /// <summary>Numeric amount derived from <see cref="Price"/>.</summary>
        public int PriceMoney { get; set; }

        /// <summary>Company that published the vacancy.</summary>
        public string Company { get; set; }

        /// <summary>Country part of the vacancy location.</summary>
        public string Country { get; set; }

        /// <summary>Region part of the vacancy location.</summary>
        public string Region { get; set; }

        /// <summary>City part of the vacancy location.</summary>
        public string City { get; set; }

        /// <summary>Required education level.</summary>
        public Education Education { get; set; }

        /// <summary>Offered employment type.</summary>
        public Employment Employment { get; set; }

        /// <summary>
        /// Converts the site's education label into an <see cref="Education"/> value.
        /// Surrounding whitespace is ignored.
        /// </summary>
        /// <param name="education">Label text, e.g. "Высшее".</param>
        /// <returns>The matching <see cref="Education"/> value.</returns>
        /// <exception cref="ArgumentException">The label is null or not one of the known labels.</exception>
        public static Education ParseEducation(string education)
        {
            // A null label falls through to the default branch below.
            switch (education == null ? null : education.Trim())
            {
                case "Высшее":
                    return Education.Higher;
                case "Неполное высшее":
                    return Education.IncompleteHigher;
                case "Среднее специальное":
                    return Education.SecondaryVocational;
                case "Среднее":
                    return Education.Secondary;
                case "Учащийся":
                    return Education.Pupil;
                case "Не имеет значения":
                    return Education.None;
                default:
                    throw new ArgumentException(
                        string.Format("Unknown education label: \"{0}\".", education),
                        "education");
            }
        }

        /// <summary>
        /// Converts the site's employment label into an <see cref="Employment"/> value.
        /// Surrounding whitespace is ignored.
        /// </summary>
        /// <param name="employment">Label text, e.g. "полная".</param>
        /// <returns>The matching <see cref="Employment"/> value.</returns>
        /// <exception cref="ArgumentException">The label is null or not one of the known labels.</exception>
        public static Employment ParseEmployment(string employment)
        {
            // A null label falls through to the default branch below.
            switch (employment == null ? null : employment.Trim())
            {
                case "полная":
                    return Employment.Full;
                case "частичная":
                    return Employment.Partial;
                case "фриланс":
                    return Employment.Freelance;
                default:
                    throw new ArgumentException(
                        string.Format("Unknown employment label: \"{0}\".", employment),
                        "employment");
            }
        }

        /// <summary>
        /// Pulls the first amount out of a price text such as "от 50 000".
        /// Leading non-digit text is skipped, whitespace between digit groups is
        /// ignored, and scanning stops at the first other character after the number.
        /// </summary>
        /// <returns>The amount, or 0 when there is none or it does not fit an <c>int</c>.</returns>
        private static int ExtractAmount(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return 0;
            }

            StringBuilder digits = new StringBuilder();
            foreach (char c in text)
            {
                if (c >= '0' && c <= '9')
                {
                    digits.Append(c);
                }
                else if (digits.Length > 0 && !char.IsWhiteSpace(c))
                {
                    // The number has ended (e.g. a currency suffix or the "to" part of a range).
                    break;
                }
            }

            int amount;
            bool parsed = int.TryParse(
                digits.ToString(),
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out amount);

            return parsed ? amount : 0;
        }
    }
106
107
    /// <summary>
    /// Console entry point: downloads the job listing pages, collects every vacancy
    /// into <see cref="jobList"/>, completes each one from its own page and writes
    /// the result to "report.xml".
    /// </summary>
    class Program
    {
        /// <summary>Client used by <see cref="Main"/> to download the listing pages.</summary>
        public static WebClient wClient;

        /// <summary>Request most recently created by <see cref="GetHtmlString"/>.</summary>
        public static WebRequest request;

        /// <summary>Response most recently received by <see cref="GetHtmlString"/> (disposed once it has been read).</summary>
        public static WebResponse response;

        /// <summary>Vacancies collected so far.</summary>
        public static List<HabraJob> jobList;

        /// <summary>Encoding used to decode downloaded pages.</summary>
        public static Encoding encode = System.Text.Encoding.GetEncoding("utf-8");

        /// <summary>
        /// Reads the number of listing pages from the pager element ("nav-pages"):
        /// the number at the end of the href of the first child of the last
        /// <c>li</c> item.
        /// </summary>
        /// <param name="html">A parsed listing page.</param>
        /// <returns>The page count, or 1 when the pager or its link cannot be found.</returns>
        static int GetPagesCount(HtmlDocument html)
        {
            HtmlNode pager = html.GetElementbyId("nav-pages");
            if (pager == null)
            {
                return 1;
            }

            HtmlNode lastItem = pager.ChildNodes.LastOrDefault(x => x.Name == "li");
            if (lastItem == null || lastItem.FirstChild == null)
            {
                return 1;
            }

            HtmlAttribute href = lastItem.FirstChild.Attributes["href"];
            if (href == null)
            {
                return 1;
            }

            // Parse the whole trailing number, not just one character,
            // so that listings with ten or more pages are handled.
            return ParseTrailingNumber(href.Value, 1);
        }

        /// <summary>
        /// Returns the last run of ASCII digits in <paramref name="text"/>, ignoring
        /// any non-digit characters after it (such as a closing "/").
        /// </summary>
        /// <returns>The parsed number, or <paramref name="fallback"/> when there is none.</returns>
        private static int ParseTrailingNumber(string text, int fallback)
        {
            if (string.IsNullOrEmpty(text))
            {
                return fallback;
            }

            int end = text.Length;
            while (end > 0 && !IsAsciiDigit(text[end - 1]))
            {
                end--;
            }

            int start = end;
            while (start > 0 && IsAsciiDigit(text[start - 1]))
            {
                start--;
            }

            int number;
            if (start < end && int.TryParse(text.Substring(start, end - start), out number))
            {
                return number;
            }

            return fallback;
        }

        private static bool IsAsciiDigit(char c)
        {
            return c >= '0' && c <= '9';
        }

        /// <summary>
        /// Adds one <see cref="HabraJob"/> to <see cref="jobList"/> for every row of
        /// the "job-items" table that has at least three <c>td</c> cells
        /// (link/title, price, location).
        /// </summary>
        /// <param name="html">A parsed listing page.</param>
        static void GetJobLinks(HtmlDocument html)
        {
            var rows = html.GetElementbyId("job-items").ChildNodes.Where(x => x.Name == "tr");

            foreach (HtmlNode row in rows)
            {
                HtmlNode[] cells = row.ChildNodes.Where(x => x.Name == "td").ToArray();
                if (cells.Length < 3)
                {
                    // Rows without the three data cells carry no vacancy.
                    continue;
                }

                HtmlNode[] location = cells[2].ChildNodes.Where(x => x.Name == "a").ToArray();

                jobList.Add(new HabraJob()
                {
                    Url = cells[0].FirstChild.Attributes["href"].Value,
                    Title = cells[0].FirstChild.InnerText,
                    Price = cells[1].FirstChild.InnerText,
                    Country = InnerTextAt(location, 0),
                    // Was location[2], which duplicated the city into the region.
                    Region = InnerTextAt(location, 1),
                    City = InnerTextAt(location, 2)
                });
            }
        }

        /// <summary>Text of the node at <paramref name="index"/>, or null when the array is shorter.</summary>
        private static string InnerTextAt(HtmlNode[] nodes, int index)
        {
            return index < nodes.Length ? nodes[index].InnerText : null;
        }

        /// <summary>
        /// Downloads the vacancy page at <c>job.Url</c> and fills in the company,
        /// education and employment from its details table.
        /// </summary>
        /// <param name="job">Vacancy to complete; modified in place.</param>
        static void GetFullInfo(HabraJob job)
        {
            HtmlDocument html = new HtmlDocument();
            html.LoadHtml(GetHtmlString(job.Url));

            // The details table is located by a fixed positional path, which breaks
            // as soon as the page layout changes (original author's note: "this is
            // not the way to do it").
            var rows = html.GetElementbyId("main-content")
                .ChildNodes[1].ChildNodes[9].ChildNodes[1].ChildNodes[2].ChildNodes[1].ChildNodes[3]
                .ChildNodes.Where(x => x.Name == "tr").ToArray();

            foreach (var tr in rows)
            {
                HtmlNode header = tr.ChildNodes.FindFirst("th");
                HtmlNode cell = tr.ChildNodes.FindFirst("td");
                if (header == null || cell == null)
                {
                    // Not a "name / value" row.
                    continue;
                }

                switch (header.InnerText)
                {
                    case "Компания":
                        if (cell.FirstChild != null)
                        {
                            job.Company = cell.FirstChild.InnerText;
                        }
                        break;
                    case "Образование:":
                        job.Education = HabraJob.ParseEducation(cell.InnerText);
                        break;
                    case "Занятость:":
                        job.Employment = HabraJob.ParseEmployment(cell.InnerText);
                        break;
                    default:
                        break;
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> without a proxy and returns the body
        /// decoded with <see cref="encode"/>.
        /// </summary>
        /// <param name="url">Address to download.</param>
        /// <returns>The response body as text.</returns>
        public static string GetHtmlString(string url)
        {
            request = WebRequest.Create(url);
            request.Proxy = null;
            response = request.GetResponse();

            // Dispose the response as well as the reader, so it is released
            // even if opening the response stream fails.
            using (response)
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), encode))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>Writes <paramref name="jobList"/> as XML to "report.xml" in the current directory.</summary>
        /// <param name="jobList">Vacancies to serialize.</param>
        public static void SerializeToXml(List<HabraJob> jobList)
        {
            using (TextWriter output = new StreamWriter("report.xml"))
            {
                XmlSerializer serializer = new XmlSerializer(typeof(List<HabraJob>));
                serializer.Serialize(output, jobList);
            }
        }

        /// <summary>
        /// Scrapes the first listing page, then pages 2 up to the page count, then
        /// every vacancy page, and finally writes the report.
        /// </summary>
        static void Main(string[] args)
        {
            jobList = new List<HabraJob>();
            wClient = new WebClient();

            wClient.Proxy = null;
            wClient.Encoding = encode;

            HtmlDocument html = new HtmlDocument();

            html.LoadHtml(wClient.DownloadString("http://habr.ru/job"));
            GetJobLinks(html);

            // Must be read before html is reloaded with the following pages.
            int pagesCount = GetPagesCount(html);

            for (int i = 2; i <= pagesCount; i++)
            {
                html.LoadHtml(wClient.DownloadString(string.Format("http://habrahabr.ru/job/page{0}", i)));
                GetJobLinks(html);
            }

            foreach (var job in jobList)
            {
                GetFullInfo(job);
            }

            SerializeToXml(jobList);
        }
    }
232
}