/r/DailyProgramming Challenge #125 (Easy) - Word Analytics

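You're a newly hired engineer for a brand-new company that's building a "killer Word-like application". You've been specifically assigned to implement a tool that gives the user some details on common word usage, letter usage, and some other analytics for a given document! More specifically, you must read a given text file (no special formatting, just a plain ASCII text file) and print off the following details: the number of words; the number of letters; the number of symbols (non-alphanumeric characters); the top three most common words; and the top three most common letters. Optionally, also print the most common first word of a paragraph, the words that are used only once, and the alphanumeric characters that are never used.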

Please note that your tool does not have to be case sensitive, meaning the word "Hello" is the same as "hello" and "HELLO".

<?php
/**
 * Word Analytics: reports word, letter and symbol statistics for a plain
 * ASCII text file uploaded through the "inputFile" form field.
 *
 * NOTE(review): the outermost upload check and the HTML tags inside the
 * echo strings were missing from the listing this script was recovered
 * from; both are reconstructed here. The visible text of every message is
 * kept. Confirm the markup against the intended page layout.
 */

/**
 * Escape a value for safe inclusion in HTML output.
 *
 * @param mixed $text Value to print.
 * @return string HTML-escaped string.
 */
function escapeHtml( $text )
{
    return htmlspecialchars( (string) $text, ENT_QUOTES );
}

/**
 * Tally the statistics for a list of text lines (case-insensitive).
 *
 * A token is a run of non-whitespace characters. Its alphanumeric
 * characters (a-z, 0-9) form the word; every other character in the token
 * counts as a symbol. Whitespace, including line endings, is never counted.
 * A whitespace-only line marks a paragraph break; the start of the input
 * also counts as the start of a paragraph.
 *
 * @param array $lines Lines of text, e.g. as returned by file().
 * @return array Keys: 'wordCount', 'letterCount', 'symbolCount',
 *               'wordUse' (word => uses), 'letterUse' (character => uses,
 *               one entry for each of the 36 alphanumerics),
 *               'paragraphStarters' (word => paragraphs started).
 */
function analyzeLines( $lines )
{
    $letterUse = array_fill_keys( str_split( 'abcdefghijklmnopqrstuvwxyz0123456789' ), 0 );
    $wordUse = array();
    $paragraphStarters = array();
    $wordCount = 0;
    $letterCount = 0;
    $symbolCount = 0;

    //Beginning of file counts as beginning of paragraph
    $atParagraphStart = true;

    foreach ( $lines as $line ) {
        //Splitting on whitespace (instead of strtok on ' ') keeps tabs and
        //line endings out of the tokens and never yields a false-y "0" stop.
        $tokens = preg_split( '/\s+/', strtolower( $line ), -1, PREG_SPLIT_NO_EMPTY );

        if ( count( $tokens ) === 0 ) {
            //Whitespace-only line: the next word starts a paragraph
            $atParagraphStart = true;
            continue;
        }

        foreach ( $tokens as $token ) {
            $word = '';
            foreach ( str_split( $token ) as $char ) {
                if ( isset( $letterUse[$char] ) ) {
                    $letterUse[$char] += 1;
                    $letterCount += 1;
                    $word .= $char;
                } else {
                    $symbolCount += 1;
                }
            }

            //A token made only of symbols (e.g. "--") is not a word
            if ( $word === '' ) {
                continue;
            }

            $wordCount += 1;
            $wordUse[$word] = isset( $wordUse[$word] ) ? $wordUse[$word] + 1 : 1;

            if ( $atParagraphStart ) {
                $paragraphStarters[$word] = isset( $paragraphStarters[$word] )
                    ? $paragraphStarters[$word] + 1
                    : 1;
                $atParagraphStart = false;
            }
        }
    }

    return array(
        'wordCount'         => $wordCount,
        'letterCount'       => $letterCount,
        'symbolCount'       => $symbolCount,
        'wordUse'           => $wordUse,
        'letterUse'         => $letterUse,
        'paragraphStarters' => $paragraphStarters,
    );
}

/**
 * Group items by use count and keep the highest distinct counts.
 *
 * Items with zero uses are ignored. Fewer than $limit groups are returned
 * when there are fewer distinct counts.
 *
 * @param array $counts Map of item => number of uses.
 * @param int   $limit  Maximum number of distinct counts to keep.
 * @return array Map of use count => list of items, highest count first.
 */
function rankByUse( $counts, $limit )
{
    $groups = array();
    foreach ( $counts as $item => $uses ) {
        if ( $uses > 0 ) {
            $groups[$uses][] = $item;
        }
    }
    krsort( $groups, SORT_NUMERIC );

    return array_slice( $groups, 0, $limit, true );
}

/**
 * Render ranked groups as a titled, nested HTML list.
 *
 * @param string $title  Heading shown above the list.
 * @param array  $groups Output of rankByUse().
 * @return string HTML fragment.
 */
function renderRankedList( $title, $groups )
{
    $html = "<h3>" . escapeHtml( $title ) . "</h3>\n<ol>\n";
    foreach ( $groups as $uses => $items ) {
        $html .= "<li>" . $uses . " uses:\n<ul>\n";
        foreach ( $items as $item ) {
            $html .= "<li>" . escapeHtml( $item ) . "</li>\n";
        }
        $html .= "</ul>\n</li>\n";
    }

    return $html . "</ol>\n";
}

/**
 * Render a titled bullet list.
 *
 * @param string $title Heading shown above the list.
 * @param array  $items Items to list.
 * @return string HTML fragment.
 */
function renderBulletList( $title, $items )
{
    $html = "<h3>" . escapeHtml( $title ) . "</h3>\n<ul>\n";
    foreach ( $items as $item ) {
        $html .= "<li>" . escapeHtml( $item ) . "</li>\n";
    }

    return $html . "</ul>\n";
}

//NOTE(review): the original outer condition was lost; an isset() check on
//the upload field is assumed because the final else asks for an upload.
if ( isset( $_FILES['inputFile'] ) ) {
    if ( $_FILES['inputFile']['error'] > 0 ) {
        echo "<p>Error: " . escapeHtml( $_FILES['inputFile']['error'] ) . "</p>\n";
    } else {
        //Get the filepath and read the file into an array of lines
        $fileLocation = $_FILES['inputFile']['tmp_name'];
        $lines = file( $fileLocation );

        //Handle unreadable or empty input file
        if ( $lines === false || count( $lines ) < 1 ) {
            echo "<p>The text file supplied is empty. Please try again.</p>\n";
        } else {
            //We'll show the user the text being processed
            echo "<h2>Text Being Analyzed:</h2>\n<div>\n";
            foreach ( $lines as $L ) {
                if ( trim( $L ) === '' ) {
                    //Blank line separates paragraphs
                    echo "<br />\n";
                } else {
                    //Uploaded content is untrusted, so escape it
                    echo "<p>" . escapeHtml( rtrim( $L, "\r\n" ) ) . "</p>\n";
                }
            }
            echo "</div>\n";

            $stats = analyzeLines( $lines );

            echo "<h2>Text Analysis:</h2>\n";

            //Challenge 1
            echo "<p>Words in text: " . $stats['wordCount'] . "</p>\n";

            //Challenge 2
            echo "<p>Letters in text: " . $stats['letterCount'] . "</p>\n";

            //Challenge 3
            echo "<p>Number of non-alphanumerics: " . $stats['symbolCount'] . "</p>\n";

            //Challenge 4
            echo renderRankedList( "Top 3 most common words:", rankByUse( $stats['wordUse'], 3 ) );

            //Challenge 5
            echo renderRankedList( "Top 3 most common letters:", rankByUse( $stats['letterUse'], 3 ) );

            //Optional 1
            echo "<h3>Most common first word of a paragraph:</h3>\n";
            $starters = $stats['paragraphStarters'];
            $max = count( $starters ) > 0 ? max( $starters ) : 0;
            echo renderBulletList( "Starts " . $max . " paragraphs:", array_keys( $starters, $max ) );

            //Optional 2: five words per table row, last row padded
            echo "<table>\n<tr><th colspan=\"5\">Words used only once:</th></tr>\n";
            foreach ( array_chunk( array_keys( $stats['wordUse'], 1 ), 5 ) as $row ) {
                echo "<tr>";
                foreach ( array_pad( $row, 5, '' ) as $cell ) {
                    echo "<td>" . escapeHtml( $cell ) . "</td>";
                }
                echo "</tr>\n";
            }
            echo "</table>\n";

            //Optional 3
            echo renderBulletList( "Alphanumerics not used:", array_keys( $stats['letterUse'], 0 ) );
        }
    }
} else {
    echo "<p>Please upload a text file to analyze.</p>\n";
}
?>

Enter text to analyze: