Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
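- /**
- * Fetches the web page at the URL held in {@code line} and looks for its
- * {@code <title>} element.
- * <p>
- * This method takes no parameters and returns nothing: when a title is
- * found it is passed to {@code setLine} instead of being returned.
- */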
- private void getTitle()
- {
- String html = "";
- String tmp = "";
- String charSet = "UTF-8"; //this is later overwritten
- Pattern pTitle = Pattern.compile("<title>(.*?)</title>");
- Pattern pCharSet = Pattern.compile("charset=(.*?)\"");
- try
- {
- //sets the url
- URL url = new URL(line);
- //opens the connection
- URLConnection urlConnection = url.openConnection();
- //gets the input stream and stores it
- //DataInputStream dis = new DataInputStream(urlConnection.getInputStream());
- BufferedReader d = new BufferedReader(
- new InputStreamReader(
- urlConnection.getInputStream()));
- charSet = new InputStreamReader(
- urlConnection.getInputStream()
- ).getEncoding();
- //String content = urlConnection.getContentEncoding();
- //Map Content = urlConnection.getHeaderFields();
- //the html string to store it
- //tmp = d.readLine();
- //html = d.readLine();
- //add all the lines to a String
- while ((tmp = d.readLine()) != null)
- {
- html += " " + tmp;
- //collapse every run of whitespace (\s, i.e. [ \t\n\x0B\f\r]) into a single space
- html = html.replaceAll("\\s+", " ");
- Matcher mCharSet = pCharSet.matcher(html);
- if(mCharSet.find() == true)
- {
- System.out.println("CharSet Match: "+ mCharSet.group(1));
- charSet = mCharSet.group(1);
- break;
- }
- }
- //opens the connection again
- urlConnection = url.openConnection();
- //tmp = new String(tmp.getBytes(charSet), charSet);
- d = new BufferedReader(
- new InputStreamReader(
- urlConnection.getInputStream(), charSet));
- while ((tmp = d.readLine()) != null)
- {
- html += " " + tmp;
- Matcher mTitle = pTitle.matcher(html);
- if(mTitle.find() == true)
- {
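- //intended to collapse every run of whitespace (\s, i.e. [ \t\n\x0B\f\r]) into a single space
- //NOTE(review): the result of replaceAll below is not assigned, so the title is left unchanged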
- mTitle.group(1).replaceAll("\\s+", " ");
- // choose an encoding
- //Charset cs = Charset.forName( charSet );
- // for byte to char
- //CharsetDecoder decoder = cs.newDecoder();
- // for char to byte
- //CharsetEncoder encoder = cs.newEncoder();
- // Presuming you have ByteBuffers and CharBuffer objects
- // as a side effect of doing nio-style i/o.
- //CharBuffer ss = CharBuffer.wrap( mTitle.group(1) );
- // effectively convert byte[] to char[] after a read
- //CharBuffer charBuffer = decoder.decode( ss );
- // effectively convert char[] to byte[] before a write
- //ByteBuffer byteBuffer = encoder.encode( charBuffer );
- //System.out.println("LINE_0: "+m.group(0));
- /*
- System.out.println("LINE_1: "+tmp);
- String test = "";
- test = new String(tmp.getBytes(), charSet);
- System.out.println("TEST_1: "+ test);
- test = new String(tmp.getBytes(), "Shift-JIS");
- System.out.println("TEST_2: "+ test);
- */
- System.out.println("LINE_1: "+mTitle.group(1));
- anArrayOfStrings[0] = mTitle.group(1) + " 00";
- String test = "";
- test = new String(mTitle.group(1).getBytes(), charSet);
- System.out.println("TEST_1: "+ test);
- anArrayOfStrings[1] = test + " 01";
- test = new String(mTitle.group(1).getBytes(), "UTF8");
- System.out.println("TEST_2: "+ test);
- anArrayOfStrings[2] = test + " 02";
- test = new String(mTitle.group(1).getBytes(), "Shift_JIS");
- System.out.println("TEST_3: "+ test);
- anArrayOfStrings[3] = test + " 03";
- test = new String(mTitle.group(1).getBytes(), "UTF-16");
- System.out.println("TEST_4: "+ test);
- anArrayOfStrings[4] = test + " 04";
- test = new String(mTitle.group(1).getBytes(), "UTF-8");
- System.out.println("TEST_5: "+ test);
- anArrayOfStrings[5] = test + " 05";
- test = new String(mTitle.group(1).getBytes(), "ISO-8859-1");
- System.out.println("TEST_6: "+ test);
- anArrayOfStrings[6] = test + " 06";
- test = new String(mTitle.group(1).getBytes(), "US-ASCII");
- System.out.println("TEST_7: "+ test);
- anArrayOfStrings[7] = test + " 07";
- test = new String(mTitle.group(1).getBytes(), "Windows-1250");
- System.out.println("TEST_8: "+ test);
- anArrayOfStrings[8] = test + " 08";
- test = new String(mTitle.group(1).getBytes(), "ISO-8859-15");
- System.out.println("TEST_9: "+ test);
- anArrayOfStrings[9] = test + " 09";
- // setLine(mTitle.group(1), charSet);
- setLine(new String(mTitle.group(1).getBytes(), charSet));
- break;
- }
- }
- //close the stream
- //dis.close();
- //replace all the \s[ \t\n\x0B\f\r] with an whitespace
- //html = html.replaceAll("\\s+", " ");
- //set a pattern to match against
- //Pattern p = Pattern.compile("<title>(.*?)</title>");
- //matches
- //Matcher m = p.matcher(html);
- //if(m.find() == true)
- //{
- //System.out.println("LINE_0: "+m.group(0));
- //System.out.println("LINE_1: "+m.group(1));
- //setLine(m.group(1));
- //}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement