<!DOCTYPE html>
<html lang="en">
<head>
<!-- Declare the encoding first so the browser never has to guess it. -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Challenge #125 (Easy) - Word Analytics</title>
<style>
/* Reset default spacing; every rule below sets its own margins and padding. */
* {
  margin: 0;
  padding: 0;
}

body {
  background: #142404;
}

/* Centered page wrapper. */
#container {
  position: relative;
  max-width: 90%;
  margin: 30px auto 10px;
  background: #ADBD8D;
}

h1 {
  position: relative;
  max-width: 95%;
  margin: 10px auto;
  text-decoration: underline;
}

/* Challenge description panel. */
#challengeSpecs {
  display: inline-block;
  width: 100%;
  border-bottom: 2px solid black;
}

#challengeSpecs p {
  margin: 10px;
  padding: 5px;
}

#challengeSpecs ul {
  position: relative;
  width: 75%;
  margin: 10px auto 0;
}

#challengeSpecs ul li {
  padding-left: 10px;
}

/* Results panel: hangs directly below the description, so it has no top margin, border or padding. */
#challengeIO {
  position: relative;
  width: 90%;
  margin: 0 auto 10px;
  border: 2px solid black;
  border-top: 0;
  padding: 0 5px 5px;
}

#challengeIO ul,
#challengeIO ol {
  position: relative;
  margin-left: 50px;
}

/* Upload form panel. */
#inputFormContainer {
  position: relative;
  text-align: center;
  width: 80%;
  margin: 20px auto;
  border: 1px solid black;
  border-bottom: 0;
  padding-bottom: 10px;
}

form {
  position: relative;
  margin: 10px auto;
}

table * {
  text-align: center;
  padding: 2px;
}
</style>
</head>
<body>
<div id="container">
<h1>/r/dailyprogrammer Challenge #125 (Easy) - Word Analytics</h1>
<!-- Challenge statement: the task description and the list of required statistics. -->
<div id="challengeSpecs">
  <p>
    You're a newly hired engineer for a brand-new company that's building a "killer Word-like application". You've been specifically assigned to implement a tool that gives the user some details on common word usage, letter usage, and some other analytics for a given document! More specifically, you must read a given text file (no special formatting, just a plain ASCII text file) and print off the following details:
  </p>
  <ul>
    <li>Number of words</li>
    <li>Number of letters</li>
    <li>Number of symbols (any non-letter and non-digit character, excluding white spaces)</li>
    <li>Top three most common words (you may count "small words", such as "it" or "the")</li>
    <li>Top three most common letters</li>
    <li>Most common first word of a paragraph (paragraph being defined as a block of text with an empty line above it) (Optional bonus)</li>
    <li>Number of words only used once (Optional bonus)</li>
    <li>All letters not used in the document (Optional bonus)</li>
  </ul>
  <p>Please note that your tool does not have to be case sensitive, meaning the word "Hello" is the same as "hello" and "HELLO".</p>
</div>
<div id="challengeIO">
<?php
// Word Analytics.
//
// When the upload form has been submitted, read the uploaded text file, echo
// it back (HTML-escaped) and print these statistics:
//   - number of words, letters (a-z and 0-9) and symbols (anything that is
//     neither alphanumeric nor whitespace)
//   - the three highest word counts and letter counts, each with every
//     word/letter that reached that count
//   - the most common first word of a paragraph
//   - the words used exactly once
//   - the alphanumeric characters never used
// Matching is case-insensitive. A "word" is a whitespace-delimited token with
// its symbols stripped; a token made only of symbols is not counted as a word.
// Without a submission, the user is simply asked to upload a file.
if ( !isset( $_POST['submit'] ) ) {
    echo "<p>Please upload a text file to analyze.</p>";
} elseif ( !isset( $_FILES['inputFile'] ) ) {
    //The form was posted without the file field
    echo "Error: no file was uploaded.<br/>";
} elseif ( $_FILES['inputFile']['error'] > 0 ) {
    //PHP reported an upload error; show its numeric code
    echo "Error: " . (int) $_FILES['inputFile']['error'] . "<br/>";
} else {
    //Only read a path that PHP itself created for this upload
    $fileLocation = $_FILES['inputFile']['tmp_name'];
    $lines = is_uploaded_file( $fileLocation ) ? file( $fileLocation ) : false;

    if ( $lines === false ) {
        echo "Error: the uploaded file could not be read.<br/>";
    } elseif ( count( $lines ) < 1 ) {
        //Handle empty input file
        echo "The text file supplied is empty. Please try again.<br/>";
    } else {
        //Usage counters, keyed by the thing being counted.
        //$letterUse is pre-filled so that unused characters can be reported.
        $alphabet = str_split( 'abcdefghijklmnopqrstuvwxyz0123456789' );
        $letterUse = array_fill_keys( $alphabet, 0 ); //character => times used
        $wordUse = array();                           //word => times used, in first-seen order
        $starterUse = array();                        //word => paragraphs it starts
        $wordCount = 0;
        $letterCount = 0;
        $symbolCount = 0;

        //We'll show the user the text being processed as we process it
        echo "<div id=\"inputText\"><br/><h2>Text Being Analyzed:</h2><br/><br/>";

        //Beginning of file counts as beginning of paragraph
        $pb = true;
        foreach ( $lines as $L ) {
            //A line holding nothing but whitespace separates paragraphs
            if ( trim( $L ) === '' ) {
                echo "<br/>";
                $pb = true;
                continue;
            }

            //The file content is untrusted: escape it before echoing
            echo "<p>" . htmlspecialchars( rtrim( $L, "\r\n" ), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' ) . "</p>";

            //Split on any whitespace so tabs and line endings never end up
            //inside a token (and are never counted as symbols)
            $tokens = preg_split( '/\s+/', strtolower( $L ), -1, PREG_SPLIT_NO_EMPTY );
            foreach ( $tokens as $token ) {
                //Count the characters and strip symbols from the word
                $word = '';
                foreach ( str_split( $token ) as $char ) {
                    if ( isset( $letterUse[$char] ) ) {
                        $letterUse[$char] += 1;
                        $letterCount += 1;
                        $word .= $char;
                    } else {
                        $symbolCount += 1;
                    }
                }

                //A token made only of symbols (e.g. "--") is not a word
                if ( $word === '' ) {
                    continue;
                }

                $wordCount += 1;
                $wordUse[$word] = isset( $wordUse[$word] ) ? $wordUse[$word] + 1 : 1;

                //The first real word after a paragraph break starts the paragraph
                if ( $pb ) {
                    $starterUse[$word] = isset( $starterUse[$word] ) ? $starterUse[$word] + 1 : 1;
                    $pb = false;
                }
            }
        }
        echo "</div><hr/>";

        echo "<div id=\"consoleText\"><br/><h2>Text Analysis:</h2>";
        //Challenge 1
        echo "<br/><p>Words in text: " . $wordCount . "</p>";
        //Challenge 2
        echo "<br/><p>Letters in text: " . $letterCount . "</p>";
        //Challenge 3
        echo "<br/><p>Number of non-alphanumerics: " . $symbolCount . "</p>";

        //Challenges 4 and 5: the three highest distinct counts, each followed
        //by every item that reached it. Unused letters are filtered out so a
        //"0 uses" group is never shown. Words and letters are [a-z0-9] only,
        //so they need no escaping.
        $rankings = array(
            'Top 3 most common words:' => $wordUse,
            'Top 3 most common letters:' => array_filter( $letterUse ),
        );
        foreach ( $rankings as $heading => $usage ) {
            $counts = array_unique( array_values( $usage ) );
            rsort( $counts );
            echo "<br/><p>" . $heading . "</p><ol>";
            //array_slice copes with fewer than three distinct counts
            foreach ( array_slice( $counts, 0, 3 ) as $count ) {
                echo "<li>" . $count . " uses:<ul>";
                foreach ( array_keys( $usage, $count ) as $item ) {
                    echo "<li>" . $item . "</li>";
                }
                echo "</ul></li>";
            }
            echo "</ol>";
        }

        //Optional 1
        echo "<br/><p>Most common first word of a paragraph:</p>";
        $max = count( $starterUse ) > 0 ? max( $starterUse ) : 0;
        echo "<p>Starts " . $max . " paragraphs:</p><ul>";
        foreach ( array_keys( $starterUse, $max ) as $starter ) {
            echo "<li>" . $starter . "</li>";
        }
        echo "</ul>";

        //Optional 2: five words per row, last row padded with empty cells
        echo "<br/><table><tr><th colspan=\"5\">Words used only once:</th></tr>";
        foreach ( array_chunk( array_keys( $wordUse, 1 ), 5 ) as $row ) {
            echo "<tr>";
            foreach ( array_pad( $row, 5, '' ) as $cell ) {
                echo "<td>" . $cell . "</td>";
            }
            echo "</tr>";
        }
        echo "</table>";

        //Optional 3
        echo "<br/><p>Alphanumerics not used:</p><ul>";
        foreach ( array_keys( $letterUse, 0 ) as $unused ) {
            echo "<li>" . $unused . "</li>";
        }
        echo "</ul>";
        echo "</div>";
    }
}
?>
</div>
<!-- Upload form: posts the chosen text file back to the analysis script. -->
<div id="inputFormContainer">
  <form action="./projects.php" method="post" id="inputForm" enctype="multipart/form-data">
    <!-- The label is tied to the file input so assistive technology announces it. -->
    <p><label for="inputFile">Enter text to analyze:</label></p>
    <input type="file" id="inputFile" name="inputFile" required>
    <!-- name="submit" is what the server-side script checks to detect a submission. -->
    <input type="submit" name="submit" value="Submit">
  </form>
</div>
</div>
</body>
</html>