You're a newly hired engineer for a brand-new company that's building a "killer Word-like application". You've been specifically assigned to implement a tool that gives the user some details on common word usage, letter usage, and some other analytics for a given document! More specifically, you must read a given text file (no special formatting, just a plain ASCII text file) and print off the following details:
- Number of words
- Number of letters
- Number of symbols (any non-letter and non-digit character, excluding white spaces)
- Top three most common words (you may count "small words", such as "it" or "the")
- Top three most common letters
- Most common first word of a paragraph (paragraph being defined as a block of text with an empty line above it) (Optional bonus)
- Number of words only used once (Optional bonus)
- All letters not used in the document (Optional bonus)
Please note that your tool does not have to be case sensitive, meaning the word "Hello" is the same as "hello" and "HELLO".
0 ) {
echo "Error: " . $_FILES['inputFile']['error'] . "
";
} else {
//Get the path of the temporary file PHP stored the upload in
$fileLocation = $_FILES['inputFile']['tmp_name'];
//Read the file into an array of lines; each entry keeps its line ending.
//No case folding happens here - words are lowercased later, one at a time.
$lines = file( $fileLocation );
//file() returns false when the file cannot be read. Treat that the same as
//an empty file so the count() check that follows reports it to the user
//instead of being handed a non-array.
if ( $lines === false ) {
    $lines = array();
}
//Handle empty input file
if ( count($lines) < 1) {
echo "The text file supplied is empty. Please try again.
";
} else {
//The upload has been read into $lines; set up the state used while processing.
//Characters being tracked: a-z followed by 0-9 (36 entries, all strings)
$alphabet = str_split( 'abcdefghijklmnopqrstuvwxyz0123456789' );
//Usage counters, parallel to $alphabet: $alphabet[i] was seen $letterUse[i] times
$letterUse = array_fill( 0, 36, 0 );
//Distinct words and their counters: $allWords[i] was seen $wordUse[i] times
$allWords = array();
$wordUse = array();
//Words that open a paragraph and how many paragraphs each one opens
$paragraphStarters = array();
$psFrequency = array();
//Running totals for the whole document
$wordCount = $letterCount = $symbolCount = 0;
//We'll show the user the text being processed as we process it
echo "
Text Being Analyzed:
";
//Beginning of file counts as beginning of paragraph
$pb = true;
//Reverse lookups (value => position in the parallel arrays). One hash lookup
//replaces a linear in_array()/array_search() pair per character and per word,
//and avoids loose comparison treating e.g. "1e3" and "1000" as the same word.
$letterIndex = array_flip( $alphabet );
$wordIndex = array_flip( $allWords );
$starterIndex = array_flip( $paragraphStarters );
//Iterate over each line in the file
foreach( $lines as $L ) {
    //A line holding nothing but whitespace (including a bare line ending)
    //separates paragraphs
    if( trim( $L ) === '' ) {
        //Print empty line and expect a paragraph to start
        echo "
";
        $pb = true;
        continue;
    }
    //Show the line. It comes straight from the uploaded file, so escape it
    //to keep any markup in the file from being interpreted by the browser.
    echo "
" . htmlspecialchars( $L, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' ) . "
";
    //Split on any run of whitespace so tabs and line endings are neither
    //glued onto words nor counted as symbols
    $tokens = preg_split( '/\s+/', $L, -1, PREG_SPLIT_NO_EMPTY );
    if( $tokens === false ) {
        $tokens = array();
    }
    foreach( $tokens as $token ) {
        //Keep tracked alphanumerics as the word; everything else is a symbol
        $word = '';
        foreach( str_split( strtolower( $token ) ) as $char ) {
            if( isset( $letterIndex[$char] ) ) {
                $letterUse[$letterIndex[$char]] += 1;
                $letterCount += 1;
                $word .= $char;
            } else {
                $symbolCount += 1;
            }
        }
        //A token made only of symbols (e.g. "--") is not a word
        if( $word === '' ) {
            continue;
        }
        //Accumulate word
        $wordCount += 1;
        if( isset( $wordIndex[$word] ) ) {
            //Seen before, so increment its frequency
            $wordUse[$wordIndex[$word]] += 1;
        } else {
            //First use: append to both parallel arrays and remember the slot
            $wordIndex[$word] = count( $allWords );
            $allWords[] = $word;
            $wordUse[] = 1;
        }
        //The first real word after a paragraph break is that paragraph's
        //starter. The cleaned word is used so "Hello," and "hello" agree.
        if( $pb ) {
            if( isset( $starterIndex[$word] ) ) {
                $psFrequency[$starterIndex[$word]] += 1;
            } else {
                $starterIndex[$word] = count( $paragraphStarters );
                $paragraphStarters[] = $word;
                $psFrequency[] = 1;
            }
            //Reset flag
            $pb = false;
        }
    }
}
//Spacer, then the heading of the results section
echo "
", "
Text Analysis:
";
//Challenge 1: total number of words
echo "
Words in text: ", $wordCount, "
";
//Challenge 2: total number of tracked alphanumeric characters
echo "
Letters in text: ", $letterCount, "
";
//Challenge 3: total number of characters counted as symbols
echo "
Number of non-alphanumerics: ", $symbolCount, "
";
//Challenge 4
echo "
Top 3 most common words:";
//Build the list of distinct usage counts, highest first
$sortedWordUse = $wordUse;
rsort( $sortedWordUse );
$sortedWordUse = array_values( array_unique( $sortedWordUse ) );
//A short text can have fewer than three distinct counts; only walk the
//tiers that actually exist instead of reading undefined offsets
$tiers = min( 3, count( $sortedWordUse ) );
for( $i = 0; $i < $tiers; $i++ ) {
    echo "- " . $sortedWordUse[$i] . " uses:
";
    //List every word that shares this usage count
    foreach( $allWords as $key => $value ) {
        if( $wordUse[$key] === $sortedWordUse[$i] ) {
            echo "- " . $value . "
";
        }
    }
    echo "
";
}
echo "
";
//Challenge 5
echo "
Top 3 most common letters:";
//Build the list of distinct usage counts, highest first
$sortedLetterUse = $letterUse;
rsort( $sortedLetterUse );
//array_filter() without a callback drops the 0 entries, so characters that
//never occur cannot show up as a "most common" tier
$sortedLetterUse = array_values( array_unique( array_filter( $sortedLetterUse ) ) );
//Fewer than three distinct counts is possible (e.g. a text using one
//letter); only walk the tiers that actually exist
$tiers = min( 3, count( $sortedLetterUse ) );
for( $i = 0; $i < $tiers; $i++ ) {
    echo "- " . $sortedLetterUse[$i] . " uses:
";
    //List every tracked character that shares this usage count
    foreach( $alphabet as $key => $value ) {
        if( $letterUse[$key] === $sortedLetterUse[$i] ) {
            echo "- " . $value . "
";
        }
    }
    echo "
";
}
echo "
";
//Optional 1
echo "
Most common first word of a paragraph:
";
//Highest number of paragraphs opened by any single word
//(0 when no paragraph starter was recorded)
$max = count( $psFrequency ) > 0 ? max( $psFrequency ) : 0;
echo "
Starts " . $max . " paragraphs:";
//Print every starter that reaches that count (ties are all listed)
foreach( $paragraphStarters as $key => $value ) {
    if( $psFrequency[$key] == $max ) {
        //Starter words originate from the uploaded file and may contain
        //arbitrary characters, so escape them before output
        echo "- " . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' ) . "
";
    }
}
echo "
";
//Optional 2
echo "
Words used only once: |
---|
";
//Column the next entry will land in; rows are five entries wide
$cells = 1;
foreach( $wordUse as $slot => $uses ) {
    //Only words seen exactly once are listed
    if( $uses != 1 ) {
        continue;
    }
    echo $allWords[$slot], " | ";
    //After the fifth entry, break the row and start again at column one
    if( ++$cells > 5 ) {
        echo "
";
        $cells = 1;
    }
}
//Pad the remaining columns of the last row with empty cells
echo str_repeat( " | ", 6 - $cells );
$cells = 6;
echo "
";
//Optional 3
echo "
Alphanumerics not used:";
//List every tracked character whose usage counter is still zero
foreach( $alphabet as $slot => $symbol ) {
    if( $letterUse[$slot] == 0 ) {
        echo "- ", $symbol, "
";
    }
}
//Two trailing line breaks close the report
echo "
", "
";
}
}
} else {
echo "
Please upload a text file to analyze.
";
}
?>