Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <!DOCTYPE html>
- <!-- lang lets assistive technology and translators pick the right language -->
- <html lang="en">
- <head>
- <!-- Declare the encoding before any text content so the browser never has to guess -->
- <meta charset="utf-8">
- <meta name="viewport" content="width=device-width, initial-scale=1">
- <title>Challenge #125 (Easy) - Word Analytics</title>
- <style>
-   /* Global reset: every element starts without margin or padding. */
-   * {
-     padding: 0;
-     margin: 0;
-   }
-
-   /* Page backdrop and the centred content card. */
-   body {
-     background: #142404;
-   }
-   #container {
-     position: relative;
-     max-width: 90%;
-     margin: 30px auto 10px;
-     background: #ADBD8D;
-   }
-   h1 {
-     position: relative;
-     max-width: 95%;
-     margin: 10px auto;
-     text-decoration: underline;
-   }
-
-   /* Challenge description panel. */
-   #challengeSpecs {
-     display: inline-block;
-     width: 100%;
-     border-bottom: 2px solid black;
-   }
-   #challengeSpecs p {
-     margin: 10px;
-     padding: 5px;
-   }
-   #challengeSpecs ul {
-     position: relative;
-     width: 75%;
-     margin: 10px auto 0;
-   }
-   #challengeSpecs ul li {
-     padding-left: 10px;
-   }
-
-   /* Analysis output panel: hangs off the description panel, so it has no
-      top margin, top border or top padding. */
-   #challengeIO {
-     position: relative;
-     width: 90%;
-     margin: 0 auto 10px;
-     border: 2px solid black;
-     border-top: 0;
-     padding: 0 5px 5px;
-   }
-   #challengeIO ul,
-   #challengeIO ol {
-     position: relative;
-     margin-left: 50px;
-   }
-
-   /* Upload form box: open at the bottom edge. */
-   #inputFormContainer {
-     position: relative;
-     width: 80%;
-     margin: 20px auto;
-     border: 1px solid black;
-     border-bottom: 0;
-     padding-bottom: 10px;
-     text-align: center;
-   }
-   form {
-     position: relative;
-     margin: 10px auto;
-   }
-
-   /* Every descendant of a table is centred with a little breathing room. */
-   table * {
-     padding: 2px;
-     text-align: center;
-   }
- </style>
- </head>
- <body>
- <div id="container">
- <h1>/r/dailyprogrammer Challenge #125 (Easy) - Word Analytics</h1>
- <!-- Static description of the challenge: the task statement, the list of
-      statistics to report, and the case-insensitivity note. -->
- <div id="challengeSpecs">
- <p>
- You're a newly hired engineer for a brand-new company that's building a "killer Word-like application". You've been specifically assigned to implement a tool that gives the user some details on common word usage, letter usage, and some other analytics for a given document! More specifically, you must read a given text file (no special formatting, just a plain ASCII text file) and print off the following details:
- </p>
- <!-- The first five items are required; the last three are optional bonuses. -->
- <ul>
- <li>Number of words</li>
- <li>Number of letters</li>
- <li>Number of symbols (any non-letter and non-digit character, excluding white spaces)</li>
- <li>Top three most common words (you may count "small words", such as "it" or "the")</li>
- <li>Top three most common letters</li>
- <li>Most common first word of a paragraph (paragraph being defined as a block of text with an empty line above it) (Optional bonus)</li>
- <li>Number of words only used once (Optional bonus)</li>
- <li>All letters not used in the document (Optional bonus)</li>
- </ul>
- <p>Please note that your tool does not have to be case sensitive, meaning the word "Hello" is the same as "hello" and "HELLO".</p>
- </div>
- <div id="challengeIO">
- <?php
- // Word analytics for an uploaded plain-text file.
- //
- // When the upload form has been submitted, the file is read line by line,
- // echoed back (HTML-escaped) and then summarised: word, alphanumeric and
- // symbol totals, the three highest word and character frequencies, the most
- // common paragraph-opening word, the words used exactly once, and the
- // alphanumerics that never appear. Matching is case-insensitive. Without a
- // submission, a prompt to upload a file is shown instead.
- if ( isset( $_POST['submit'] ) ) {
-     $upload = isset( $_FILES['inputFile'] ) ? $_FILES['inputFile'] : null;
-     if ( $upload === null ) {
-         // The request carried no file part at all (e.g. a hand-crafted POST).
-         echo "Error: no file was uploaded.<br/>";
-     } elseif ( $upload['error'] !== UPLOAD_ERR_OK ) {
-         // Report PHP's upload error code; the cast keeps the output numeric.
-         echo "Error: " . (int) $upload['error'] . "<br/>";
-     } else {
-         $lines = file( $upload['tmp_name'] );
-         if ( $lines === false ) {
-             echo "Error: the uploaded file could not be read.<br/>";
-         } elseif ( count( $lines ) < 1 ) {
-             echo "The text file supplied is empty. Please try again.<br/>";
-         } else {
-             // Tally of every tracked character (a-z, 0-9), pre-filled with 0
-             // so unused characters can be listed afterwards.
-             $alphabet = str_split( 'abcdefghijklmnopqrstuvwxyz0123456789' );
-             $letterUse = array_fill_keys( $alphabet, 0 );
-             $wordUse = array();     // word => number of occurrences
-             $starterUse = array();  // paragraph-opening word => number of paragraphs
-             $wordCount = 0;
-             $letterCount = 0;
-             $symbolCount = 0;
-
-             // Show the user the text being processed as we process it.
-             echo "<div id=\"inputText\"><br/><h2>Text Being Analyzed:</h2><br/><br/>";
-             // The beginning of the file counts as the beginning of a paragraph.
-             $pb = true;
-             foreach ( $lines as $L ) {
-                 // Any empty or whitespace-only line separates paragraphs.
-                 if ( trim( $L ) === '' ) {
-                     echo "<br/>";
-                     $pb = true;
-                     continue;
-                 }
-                 // The file is untrusted input: escape it before echoing.
-                 echo "<p>" . htmlspecialchars( rtrim( $L, "\r\n" ), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' ) . "</p>";
-                 // Split on any run of whitespace so tabs and the line ending
-                 // never end up inside a token.
-                 $tokens = preg_split( '/\s+/', strtolower( $L ), -1, PREG_SPLIT_NO_EMPTY );
-                 foreach ( $tokens as $token ) {
-                     // Count characters and strip symbols from the token.
-                     // str_split works on bytes, so each byte of a multi-byte
-                     // character counts as one symbol.
-                     $word = '';
-                     foreach ( str_split( $token ) as $char ) {
-                         if ( isset( $letterUse[$char] ) ) {
-                             $letterUse[$char] += 1;
-                             $letterCount += 1;
-                             $word .= $char;
-                         } else {
-                             $symbolCount += 1;
-                         }
-                     }
-                     // A token made only of symbols (e.g. "--") is not a word.
-                     if ( $word === '' ) {
-                         continue;
-                     }
-                     $wordCount += 1;
-                     $wordUse[$word] = isset( $wordUse[$word] ) ? $wordUse[$word] + 1 : 1;
-                     // The first real word after a paragraph break opens the paragraph.
-                     if ( $pb ) {
-                         $starterUse[$word] = isset( $starterUse[$word] ) ? $starterUse[$word] + 1 : 1;
-                         $pb = false;
-                     }
-                 }
-             }
-             echo "</div><hr/>";
-
-             // From here on, every echoed word or character consists solely of
-             // a-z / 0-9, so it is safe to print without further escaping.
-             echo "<div id=\"consoleText\"><br/><h2>Text Analysis:</h2>";
-             // Challenge 1
-             echo "<br/><p>Words in text: " . $wordCount . "</p>";
-             // Challenge 2
-             echo "<br/><p>Letters in text: " . $letterCount . "</p>";
-             // Challenge 3
-             echo "<br/><p>Number of non-alphanumerics: " . $symbolCount . "</p>";
-
-             // Challenge 4: the three highest distinct frequencies, each with
-             // every word that shares it. array_slice copes with texts that
-             // have fewer than three distinct frequencies.
-             echo "<br/><ol>Top 3 most common words:";
-             $wordRanks = array_unique( $wordUse );
-             rsort( $wordRanks );
-             foreach ( array_slice( $wordRanks, 0, 3 ) as $uses ) {
-                 echo "<li>" . $uses . " uses:<ul>";
-                 foreach ( array_keys( $wordUse, $uses, true ) as $word ) {
-                     echo "<li>" . $word . "</li>";
-                 }
-                 echo "</ul></li>";
-             }
-             echo "</ol>";
-
-             // Challenge 5: same ranking for characters; array_filter drops
-             // characters that were never used so "0 uses" is never ranked.
-             echo "<br/><ol>Top 3 most common letters:";
-             $letterRanks = array_unique( array_filter( $letterUse ) );
-             rsort( $letterRanks );
-             foreach ( array_slice( $letterRanks, 0, 3 ) as $uses ) {
-                 echo "<li>" . $uses . " uses:<ul>";
-                 foreach ( array_keys( $letterUse, $uses, true ) as $char ) {
-                     echo "<li>" . $char . "</li>";
-                 }
-                 echo "</ul></li>";
-             }
-             echo "</ol>";
-
-             // Optional 1: all words tied for opening the most paragraphs.
-             echo "<br/><p>Most common first word of a paragraph:</p>";
-             $max = empty( $starterUse ) ? 0 : max( $starterUse );
-             echo "<ul>Starts " . $max . " paragraphs:";
-             foreach ( array_keys( $starterUse, $max, true ) as $word ) {
-                 echo "<li>" . $word . "</li>";
-             }
-             echo "</ul>";
-
-             // Optional 2: words used exactly once, five per table row; the
-             // last row is padded with empty cells.
-             echo "<br/><table><tr><th colspan=\"5\">Words used only once:</th></tr>";
-             foreach ( array_chunk( array_keys( $wordUse, 1, true ), 5 ) as $row ) {
-                 echo "<tr>";
-                 foreach ( array_pad( $row, 5, '' ) as $cell ) {
-                     echo "<td>" . $cell . "</td>";
-                 }
-                 echo "</tr>";
-             }
-             echo "</table>";
-
-             // Optional 3: tracked characters that never appeared.
-             echo "<br/><ul>Alphanumerics not used:";
-             foreach ( array_keys( $letterUse, 0, true ) as $char ) {
-                 echo "<li>" . $char . "</li>";
-             }
-             echo "</ul>";
-             echo "</div>";
-         }
-     }
- } else {
-     echo "<p>Please upload a text file to analyze.</p>";
- }
- ?>
- </div>
- <!-- Upload form. The PHP block above reads $_POST['submit'] and
-      $_FILES['inputFile'], so both control names must stay as they are. -->
- <div id="inputFormContainer">
- <form action="./projects.php" method="post" id="inputForm" enctype="multipart/form-data">
- <!-- A real label gives the file picker an accessible name -->
- <p><label for="inputFile">Enter text to analyze:</label></p>
- <input type="file" id="inputFile" name="inputFile" accept=".txt,text/plain" required />
- <input type="submit" name="submit" value="Submit" />
- </form>
- </div>
- </div>
- </body>
- </html>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement