Advertisement
Guest User

selenium/HttpResponse.java

a guest
Jun 2nd, 2016
242
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.78 KB | None | 0 0
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.nutch.protocol.selenium;
  18.  
  19. // JDK imports
  20. import java.io.BufferedInputStream;
  21. import java.io.EOFException;
  22. import java.io.IOException;
  23. import java.io.InputStream;
  24. import java.io.OutputStream;
  25. import java.io.ByteArrayOutputStream;
  26. import java.io.PushbackInputStream;
  27. import java.net.InetSocketAddress;
  28. import java.net.Socket;
  29. import java.net.URL;
  30. import java.util.Arrays;
  31. import java.util.HashSet;
  32. import java.util.Set;
  33.  
  34. import javax.net.ssl.SSLSocket;
  35. import javax.net.ssl.SSLSocketFactory;
  36.  
  37.  
  38. import org.apache.hadoop.conf.Configuration;
  39. import org.apache.nutch.crawl.CrawlDatum;
  40. import org.apache.nutch.metadata.Metadata;
  41. import org.apache.nutch.metadata.SpellCheckedMetadata;
  42. import org.apache.nutch.net.protocols.HttpDateFormat;
  43. import org.apache.nutch.net.protocols.Response;
  44. import org.apache.nutch.protocol.ProtocolException;
  45. import org.apache.nutch.protocol.http.api.HttpException;
  46. import org.apache.nutch.protocol.http.api.HttpBase;
  47.  
  48. /* Most of this code was borrowed from protocol-htmlunit, which in turn borrowed it from protocol-httpclient. */
  49.  
  50. public class HttpResponse implements Response {
  51.  
  52. private Http http;
  53. private URL url;
  54. private String orig;
  55. private String base;
  56. private byte[] content;
  57. private int code;
  58. private Metadata headers = new SpellCheckedMetadata();
  59.  
  60. /** The nutch configuration */
  61. private Configuration conf = null;
  62.  
  63. protected enum Scheme {
  64. HTTP, HTTPS,
  65. }
  66.  
  67. public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
  68.  
  69. this.conf = http.getConf();
  70. this.http = http;
  71. this.url = url;
  72. this.orig = url.toString();
  73. this.base = url.toString();
  74.  
  75. Scheme scheme = null;
  76.  
  77. if ("http".equals(url.getProtocol())) {
  78. scheme = Scheme.HTTP;
  79. } else if ("https".equals(url.getProtocol())) {
  80. scheme = Scheme.HTTPS;
  81. } else {
  82. throw new HttpException("Unknown scheme (not http/https) for url:" + url);
  83. }
  84.  
  85. if (Http.LOG.isTraceEnabled()) {
  86. Http.LOG.trace("fetching " + url);
  87. }
  88.  
  89. String path = "".equals(url.getFile()) ? "/" : url.getFile();
  90.  
  91. // some servers will redirect a request with a host line like
  92. // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they
  93. // don't want the :80...
  94.  
  95. String host = url.getHost();
  96. int port;
  97. String portString;
  98. if (url.getPort() == -1) {
  99. if (scheme == Scheme.HTTP) {
  100. port = 80;
  101. } else {
  102. port = 443;
  103. }
  104. portString = "";
  105. } else {
  106. port = url.getPort();
  107. portString = ":" + port;
  108. }
  109. Socket socket = null;
  110.  
  111. try {
  112. socket = new Socket(); // create the socket
  113. socket.setSoTimeout(http.getTimeout());
  114.  
  115. // connect
  116. String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
  117. int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
  118. InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
  119. socket.connect(sockAddr, http.getTimeout());
  120.  
  121. if (scheme == Scheme.HTTPS) {
  122. SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
  123. .getDefault();
  124. SSLSocket sslsocket = (SSLSocket) factory
  125. .createSocket(socket, sockHost, sockPort, true);
  126. sslsocket.setUseClientMode(true);
  127.  
  128. // Get the protocols and ciphers supported by this JVM
  129. Set<String> protocols = new HashSet<String>(
  130. Arrays.asList(sslsocket.getSupportedProtocols()));
  131. Set<String> ciphers = new HashSet<String>(
  132. Arrays.asList(sslsocket.getSupportedCipherSuites()));
  133.  
  134. // Intersect with preferred protocols and ciphers
  135. protocols.retainAll(http.getTlsPreferredProtocols());
  136. ciphers.retainAll(http.getTlsPreferredCipherSuites());
  137.  
  138. sslsocket.setEnabledProtocols(
  139. protocols.toArray(new String[protocols.size()]));
  140. sslsocket.setEnabledCipherSuites(
  141. ciphers.toArray(new String[ciphers.size()]));
  142.  
  143. sslsocket.startHandshake();
  144. socket = sslsocket;
  145. }
  146.  
  147.  
  148.  
  149. // make request
  150. OutputStream req = socket.getOutputStream();
  151.  
  152. StringBuffer reqStr = new StringBuffer("GET ");
  153. if (http.useProxy(url)) {
  154. reqStr.append(url.getProtocol() + "://" + host + portString + path);
  155. } else {
  156. reqStr.append(path);
  157. }
  158.  
  159. reqStr.append(" HTTP/1.0\r\n");
  160.  
  161. reqStr.append("Host: ");
  162. reqStr.append(host);
  163. reqStr.append(portString);
  164. reqStr.append("\r\n");
  165.  
  166. reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
  167.  
  168. String userAgent = http.getUserAgent();
  169. if ((userAgent == null) || (userAgent.length() == 0)) {
  170. if (Http.LOG.isErrorEnabled()) {
  171. Http.LOG.error("User-agent is not set!");
  172. }
  173. } else {
  174. reqStr.append("User-Agent: ");
  175. reqStr.append(userAgent);
  176. reqStr.append("\r\n");
  177. }
  178.  
  179. reqStr.append("Accept-Language: ");
  180. reqStr.append(this.http.getAcceptLanguage());
  181. reqStr.append("\r\n");
  182.  
  183. reqStr.append("Accept: ");
  184. reqStr.append(this.http.getAccept());
  185. reqStr.append("\r\n");
  186.  
  187. if (datum.getModifiedTime() > 0) {
  188. reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
  189. reqStr.append("\r\n");
  190. }
  191. reqStr.append("\r\n");
  192.  
  193. byte[] reqBytes = reqStr.toString().getBytes();
  194.  
  195. req.write(reqBytes);
  196. req.flush();
  197.  
  198. PushbackInputStream in = // process response
  199. new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
  200. Http.BUFFER_SIZE);
  201.  
  202. StringBuffer line = new StringBuffer();
  203.  
  204. boolean haveSeenNonContinueStatus = false;
  205. while (!haveSeenNonContinueStatus) {
  206. // parse status code line
  207. this.code = parseStatusLine(in, line);
  208. // parse headers
  209. parseHeaders(in, line);
  210. haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
  211. }
  212.  
  213. // Get Content type header
  214. String contentType = getHeader(Response.CONTENT_TYPE);
  215.  
  216. // handle with Selenium only if content type in HTML or XHTML
  217. if (contentType != null) {
  218. if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
  219. readPlainContent(url);
  220. } else {
  221. try {
  222. int contentLength = Integer.MAX_VALUE;
  223. String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  224. if (contentLengthString != null) {
  225. try {
  226. contentLength = Integer.parseInt(contentLengthString.trim());
  227. } catch (NumberFormatException ex) {
  228. throw new HttpException("bad content length: " + contentLengthString);
  229. }
  230. }
  231.  
  232. if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
  233. contentLength = http.getMaxContent();
  234. }
  235.  
  236. byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
  237. int bufferFilled = 0;
  238. int totalRead = 0;
  239. ByteArrayOutputStream out = new ByteArrayOutputStream();
  240. while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
  241. && totalRead + bufferFilled <= contentLength) {
  242. totalRead += bufferFilled;
  243. out.write(buffer, 0, bufferFilled);
  244. }
  245.  
  246. content = out.toByteArray();
  247.  
  248. } catch (Exception e) {
  249. if (code == 200)
  250. throw new IOException(e.toString());
  251. // for codes other than 200 OK, we are fine with empty content
  252. } finally {
  253. if (in != null) {
  254. in.close();
  255. }
  256. }
  257. }
  258. }
  259.  
  260. } finally {
  261. if (socket != null)
  262. socket.close();
  263. }
  264. }
  265.  
  266. /* ------------------------- *
  267. * <implementation:Response> *
  268. * ------------------------- */
  269.  
  270. public URL getUrl() {
  271. return url;
  272. }
  273.  
  274. public int getCode() {
  275. return code;
  276. }
  277.  
  278. public String getHeader(String name) {
  279. return headers.get(name);
  280. }
  281.  
  282. public Metadata getHeaders() {
  283. return headers;
  284. }
  285.  
  286. public byte[] getContent() {
  287. return content;
  288. }
  289.  
  290. /* ------------------------- *
  291. * <implementation:Response> *
  292. * ------------------------- */
  293.  
  294. private void readPlainContent(URL url) throws IOException {
  295. String page = HttpWebClient.getHtmlPage(url.toString(), conf);
  296.  
  297. content = page.getBytes("UTF-8");
  298. }
  299.  
  300. private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
  301. readLine(in, line, false);
  302.  
  303. int codeStart = line.indexOf(" ");
  304. int codeEnd = line.indexOf(" ", codeStart + 1);
  305.  
  306. // handle lines with no plaintext result code, ie:
  307. // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
  308. if (codeEnd == -1)
  309. codeEnd = line.length();
  310.  
  311. int code;
  312. try {
  313. code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
  314. } catch (NumberFormatException e) {
  315. throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
  316. }
  317.  
  318. return code;
  319. }
  320.  
  321. private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
  322.  
  323. int colonIndex = line.indexOf(":"); // key is up to colon
  324. if (colonIndex == -1) {
  325. int i;
  326. for (i = 0; i < line.length(); i++)
  327. if (!Character.isWhitespace(line.charAt(i)))
  328. break;
  329. if (i == line.length())
  330. return;
  331. throw new HttpException("No colon in header:" + line);
  332. }
  333. String key = line.substring(0, colonIndex);
  334.  
  335. int valueStart = colonIndex + 1; // skip whitespace
  336. while (valueStart < line.length()) {
  337. int c = line.charAt(valueStart);
  338. if (c != ' ' && c != '\t')
  339. break;
  340. valueStart++;
  341. }
  342. String value = line.substring(valueStart);
  343. headers.set(key, value);
  344. }
  345.  
  346. // Adds headers to our headers Metadata
  347. private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
  348.  
  349. while (readLine(in, line, true) != 0) {
  350.  
  351. // handle HTTP responses with missing blank line after headers
  352. int pos;
  353. if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
  354. || ((pos = line.indexOf("<html")) != -1)) {
  355.  
  356. in.unread(line.substring(pos).getBytes("UTF-8"));
  357. line.setLength(pos);
  358.  
  359. try {
  360. //TODO: (CM) We don't know the header names here
  361. //since we're just handling them generically. It would
  362. //be nice to provide some sort of mapping function here
  363. //for the returned header names to the standard metadata
  364. //names in the ParseData class
  365. processHeaderLine(line);
  366. } catch (Exception e) {
  367. // fixme:
  368. Http.LOG.warn("Error: ", e);
  369. }
  370. return;
  371. }
  372.  
  373. processHeaderLine(line);
  374. }
  375. }
  376.  
  377. private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
  378. throws IOException {
  379. line.setLength(0);
  380. for (int c = in.read(); c != -1; c = in.read()) {
  381. switch (c) {
  382. case '\r':
  383. if (peek(in) == '\n') {
  384. in.read();
  385. }
  386. case '\n':
  387. if (line.length() > 0) {
  388. // at EOL -- check for continued line if the current
  389. // (possibly continued) line wasn't blank
  390. if (allowContinuedLine)
  391. switch (peek(in)) {
  392. case ' ':
  393. case '\t': // line is continued
  394. in.read();
  395. continue;
  396. }
  397. }
  398. return line.length(); // else complete
  399. default:
  400. line.append((char) c);
  401. }
  402. }
  403. throw new EOFException();
  404. }
  405.  
  406. private static int peek(PushbackInputStream in) throws IOException {
  407. int value = in.read();
  408. in.unread(value);
  409. return value;
  410. }
  411. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement