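/**
* Fetches the web page at the URL held in {@code line} and extracts the
* contents of its {@code <title>} element.
* <p>
* The method takes no parameters and returns nothing. The matched title,
* re-decoded with the detected character set, is passed to
* {@code setLine(String)}; re-decoded variants of the title under several
* fixed character sets are also stored in {@code anArrayOfStrings[0..9]}.
*/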
private void getTitle()
{
String html = "";
String tmp = "";
String charSet = "UTF-8"; //this is later overwritten
Pattern pTitle = Pattern.compile("<title>(.*?)</title>");
Pattern pCharSet = Pattern.compile("charset=(.*?)\"");
try
{
//sets the url
URL url = new URL(line);
//opens the connection
URLConnection urlConnection = url.openConnection();
//gets the input stream and stores it
//DataInputStream dis = new DataInputStream(urlConnection.getInputStream());
BufferedReader d = new BufferedReader(
new InputStreamReader(
urlConnection.getInputStream()));
charSet = new InputStreamReader(
urlConnection.getInputStream()
).getEncoding();
//String content = urlConnection.getContentEncoding();
//Map Content = urlConnection.getHeaderFields();
//the html string to store it
//tmp = d.readLine();
//html = d.readLine();
//add all the lines to a String
while ((tmp = d.readLine()) != null)
{
html += " " + tmp;
//replace all the \s[ \t\n\x0B\f\r] with an whitespace
html = html.replaceAll("\\s+", " ");
Matcher mCharSet = pCharSet.matcher(html);
if(mCharSet.find() == true)
{
System.out.println("CharSet Match: "+ mCharSet.group(1));
charSet = mCharSet.group(1);
break;
}
}
//opens the connection again
urlConnection = url.openConnection();
//tmp = new String(tmp.getBytes(charSet), charSet);
d = new BufferedReader(
new InputStreamReader(
urlConnection.getInputStream(), charSet));
while ((tmp = d.readLine()) != null)
{
html += " " + tmp;
Matcher mTitle = pTitle.matcher(html);
if(mTitle.find() == true)
{
//replace all the \s[ \t\n\x0B\f\r] with an whitespace
mTitle.group(1).replaceAll("\\s+", " ");
// choose an encoding
//Charset cs = Charset.forName( charSet );
// for byte to char
//CharsetDecoder decoder = cs.newDecoder();
// for char to byte
//CharsetEncoder encoder = cs.newEncoder();
// Presuming you have ByteBuffers and CharBuffer objects
// as a side effect of doing nio-style i/o.
//CharBuffer ss = CharBuffer.wrap( mTitle.group(1) );
// effectively convert byte[] to char[] after a read
//CharBuffer charBuffer = decoder.decode( ss );
// effectively convert char[] to byte[] before a write
//ByteBuffer byteBuffer = encoder.encode( charBuffer );
//System.out.println("LINE_0: "+m.group(0));
/*
System.out.println("LINE_1: "+tmp);
String test = "";
test = new String(tmp.getBytes(), charSet);
System.out.println("TEST_1: "+ test);
test = new String(tmp.getBytes(), "Shift-JIS");
System.out.println("TEST_2: "+ test);
*/
System.out.println("LINE_1: "+mTitle.group(1));
anArrayOfStrings[0] = mTitle.group(1) + " 00";
String test = "";
test = new String(mTitle.group(1).getBytes(), charSet);
System.out.println("TEST_1: "+ test);
anArrayOfStrings[1] = test + " 01";
test = new String(mTitle.group(1).getBytes(), "UTF8");
System.out.println("TEST_2: "+ test);
anArrayOfStrings[2] = test + " 02";
test = new String(mTitle.group(1).getBytes(), "Shift_JIS");
System.out.println("TEST_3: "+ test);
anArrayOfStrings[3] = test + " 03";
test = new String(mTitle.group(1).getBytes(), "UTF-16");
System.out.println("TEST_4: "+ test);
anArrayOfStrings[4] = test + " 04";
test = new String(mTitle.group(1).getBytes(), "UTF-8");
System.out.println("TEST_5: "+ test);
anArrayOfStrings[5] = test + " 05";
test = new String(mTitle.group(1).getBytes(), "ISO-8859-1");
System.out.println("TEST_6: "+ test);
anArrayOfStrings[6] = test + " 06";
test = new String(mTitle.group(1).getBytes(), "US-ASCII");
System.out.println("TEST_7: "+ test);
anArrayOfStrings[7] = test + " 07";
test = new String(mTitle.group(1).getBytes(), "Windows-1250");
System.out.println("TEST_8: "+ test);
anArrayOfStrings[8] = test + " 08";
test = new String(mTitle.group(1).getBytes(), "ISO-8859-15");
System.out.println("TEST_9: "+ test);
anArrayOfStrings[9] = test + " 09";
// setLine(mTitle.group(1), charSet);
setLine(new String(mTitle.group(1).getBytes(), charSet));
break;
}
}
//close the stream
//dis.close();
//replace all the \s[ \t\n\x0B\f\r] with an whitespace
//html = html.replaceAll("\\s+", " ");
//set a pattern to match against
//Pattern p = Pattern.compile("<title>(.*?)</title>");
//matches
//Matcher m = p.matcher(html);
//if(m.find() == true)
//{
//System.out.println("LINE_0: "+m.group(0));
//System.out.println("LINE_1: "+m.group(1));
//setLine(m.group(1));
//}