Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package markov;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Collections;
- import java.util.HashMap;
- import java.util.Random;
- import java.util.Scanner;
/**
 * A class for generating Markov sentence chains from an input locus. A Markov
 * chain is a relational, probabilistic data set; here it maps every run of
 * {@code prefixLength} consecutive words to the words observed directly after
 * that run, and is used to generate new sentences from the input text. This
 * class is taken from a small project that originally powered a twitter bot.
 *
 * <p>All public instance methods are {@code synchronized} on the instance.
 *
 * @author Mason C***** <Mason.C***** at s*****.edu>
 */
public class Markov {

    /**
     * Upper bound on the number of words generated for one sentence. Without
     * it, an input whose chain contains a cycle with no reachable sentence
     * terminator would make generation loop forever.
     */
    private static final int MAX_WORDS_PER_SENTENCE = 1000;

    /**
     * Legacy public field. It is never assigned by this class and is kept only
     * so that existing code referring to {@code Markov.output} still compiles.
     */
    public static String output;

    /**
     * The HashMap containing the Markov chain itself. Keys are lower-cased
     * prefixes (words joined by single spaces); values are every word seen
     * directly after that prefix, duplicates included.
     *
     * Ex) With a prefix size of 2, a chain might look as such:
     * The Quick   | Brown
     * Quick Brown | Fox
     * The Quick   | gunslinger
     *
     * If the chain generates "The Quick" at any point, there will be a 50/50
     * chance of it outputting "Brown" or "gunslinger" next.
     */
    private final HashMap<String, ArrayList<String>> markovChain;

    /**
     * Prefixes (original casing) that began an input sentence. Kept outside
     * {@link #markovChain} so that no input word can collide with a reserved
     * map key.
     */
    private final ArrayList<String> startWords;

    private final Random rnd;

    /** Number of words that make up one prefix (the order of the chain). */
    private final int prefixLength;

    /**
     * Interactive entry point: asks for the prefix size and the number of
     * sentences, then repeatedly asks for a UTF-8 text file, adds it to the
     * chain and prints generated text. Ends when standard input is exhausted.
     *
     * @param args unused
     * @throws IOException declared for source compatibility; file errors are
     *                     reported on standard output and the prompt repeats
     */
    public static void main(String[] args) throws IOException {
        Scanner in = new Scanner(System.in);

        Integer prefixSize = promptForInt(in, "Enter prefix size: ");
        if (prefixSize == null) {
            return;
        }
        Integer sentences = promptForInt(in, "Enter number of sentences to generate: ");
        if (sentences == null) {
            return;
        }

        Markov markov = new Markov(prefixSize);
        while (true) {
            /*
             * Get words. The larger the input locus, the better! A prefix size
             * of 1 should have a locus of a couple sentences, a size of 2 needs
             * several pages of text, and 3 requires the entire text of Moby
             * Dick to not seem deterministic.
             */
            System.out.print("Enter the file to read: ");
            if (!in.hasNextLine()) {
                // Standard input closed: nothing more to read, so stop.
                return;
            }
            File file = new File(in.nextLine());

            StringBuilder bodyBuilder = new StringBuilder();
            // try-with-resources guarantees the file handle is released.
            try (Scanner fileIn = new Scanner(file, "UTF-8")) {
                while (fileIn.hasNextLine()) {
                    bodyBuilder.append(fileIn.nextLine()).append(" ");
                }
            } catch (IOException e) {
                System.out.println("File could not be read: " + e.getMessage());
                continue;
            }

            markov.addWords(bodyBuilder.toString());
            markov.generateSentence(sentences);
        }
    }

    /**
     * Prompts until a whole number of at least 1 is entered.
     *
     * @param in     scanner over the user's input
     * @param prompt text printed before every attempt
     * @return the parsed value, or {@code null} if the input ended first
     */
    private static Integer promptForInt(Scanner in, String prompt) {
        while (true) {
            System.out.print(prompt);
            if (!in.hasNextLine()) {
                return null;
            }
            try {
                int value = Integer.parseInt(in.nextLine().trim());
                if (value >= 1) {
                    return value;
                }
                System.out.println("Invalid input. Must be at least 1.");
            } catch (NumberFormatException e) {
                System.out.println("Invalid input. Must be an integer.");
            }
        }
    }

    /**
     * Creates an empty chain.
     *
     * @param prefixLength number of words per prefix; values below 1 are
     *                     accepted but yield a chain that never stores or
     *                     generates anything
     */
    public Markov(int prefixLength) {
        this.rnd = new Random();
        this.prefixLength = prefixLength;
        this.markovChain = new HashMap<>();
        this.startWords = new ArrayList<>();
    }

    /**
     * Splits a body of text after every '.', '?' or '!' and adds each piece
     * to the chain as one sentence. Prints the number of pieces found.
     *
     * @param words the text to learn from
     */
    public synchronized void addWords(String words) {
        String[] sentences = words.split("(?<=[.\\?\\!])");
        System.out.println("Adding " + sentences.length + " sentences to the chain and generating output...");
        for (String sentence : sentences) {
            addSentence(sentence);
        }
    }

    /**
     * Add a sentence to the Markov chain. Every run of {@code prefixLength}
     * consecutive words that is followed by another word is recorded together
     * with that following word; the first such run is also remembered as a
     * possible sentence beginning.
     *
     * <p>Words are separated by any whitespace. A prefix whose first or last
     * word contains no letter or digit (for example a lone dash) is skipped.
     * Sentences with fewer words than {@code prefixLength} are ignored.
     *
     * @param sentence the sentence to add to the Markov chain
     */
    public synchronized void addSentence(String sentence) {
        String cleaned = sentence.trim();
        if (prefixLength < 1 || cleaned.isEmpty()) {
            return;
        }
        // Splitting on whitespace runs avoids empty "words" from double spaces.
        String[] words = cleaned.split("\\s+");

        for (int i = 0; i + prefixLength < words.length; i++) {
            boolean usable = hasLetterOrDigit(words[i])
                    && hasLetterOrDigit(words[i + prefixLength - 1]);
            if (!usable) {
                continue;
            }
            String prefix = joinWords(words, i, prefixLength);
            if (i == 0) {
                // Original casing is kept so generated sentences look natural.
                startWords.add(prefix);
            }
            // Look up and store under the SAME normalized key, so suffixes
            // accumulate instead of each occurrence replacing the last one.
            String key = normalizeKey(prefix);
            ArrayList<String> suffixes = markovChain.get(key);
            if (suffixes == null) {
                suffixes = new ArrayList<>();
                markovChain.put(key, suffixes);
            }
            suffixes.add(words[i + prefixLength]);
        }
    }

    /**
     * Generates text with the Markov chain and prints it to standard output,
     * inserting a line break after the 21st, 41st, ... entry.
     *
     * <p>A sentence is complete when a word containing '.', '?' or '!' is
     * produced. A fragment that runs into a prefix with no known continuation,
     * or that reaches {@value #MAX_WORDS_PER_SENTENCE} words, also counts as a
     * sentence so that generation always terminates. Nothing is printed when
     * the chain holds no sentence beginnings or {@code sentences} is below 1.
     *
     * @param sentences how many sentences to generate
     */
    public synchronized void generateSentence(int sentences) {
        if (sentences < 1 || markovChain.isEmpty() || startWords.isEmpty()) {
            return;
        }

        ArrayList<String> newPhrase = new ArrayList<>();
        // The most recent prefixLength words, oldest first.
        ArrayList<String> window = new ArrayList<>();
        int completed = 0;
        int wordsInSentence = 0;
        boolean needStart = true;
        // True while the latest emitted word has already been counted.
        boolean terminated = false;

        while (completed < sentences) {
            if (needStart) {
                String start = startWords.get(rnd.nextInt(startWords.size()));
                newPhrase.add(start);
                window = new ArrayList<>(Arrays.asList(start.split(" ")));
                wordsInSentence = window.size();
                terminated = false;
                needStart = false;
            }

            ArrayList<String> candidates = markovChain.get(normalizeKey(joinWindow(window)));
            boolean deadEnd = candidates == null || candidates.isEmpty();
            if (deadEnd || wordsInSentence >= MAX_WORDS_PER_SENTENCE) {
                // Count the abandoned fragment so the loop always advances.
                if (!terminated) {
                    completed++;
                }
                needStart = true;
                continue;
            }

            // Frequent followers occur several times in the list, which makes
            // them proportionally more likely to be picked.
            String nextWord = candidates.get(rnd.nextInt(candidates.size()));
            newPhrase.add(nextWord);
            window.remove(0);
            window.add(nextWord);
            wordsInSentence++;

            terminated = containsTerminator(nextWord);
            if (terminated) {
                completed++;
                wordsInSentence = 0;
            }
        }

        StringBuilder outputBuilder = new StringBuilder();
        for (int i = 0; i < newPhrase.size(); i++) {
            outputBuilder.append(newPhrase.get(i)).append(" ");
            if (i % 20 == 0 && i != 0) {
                outputBuilder.append("\n");
            }
        }
        System.out.println(outputBuilder.toString());
    }

    /** Joins {@code count} words of {@code words}, from {@code from}, with single spaces. */
    private static String joinWords(String[] words, int from, int count) {
        StringBuilder joined = new StringBuilder();
        for (int k = 0; k < count; k++) {
            if (k > 0) {
                joined.append(' ');
            }
            joined.append(words[from + k]);
        }
        return joined.toString();
    }

    /** Joins the sliding window of recent words with single spaces. */
    private static String joinWindow(ArrayList<String> window) {
        return joinWords(window.toArray(new String[0]), 0, window.size());
    }

    /** Produces the case-insensitive, locale-independent map key for a prefix. */
    private static String normalizeKey(String prefix) {
        return prefix.trim().toLowerCase(java.util.Locale.ROOT);
    }

    /** Tells whether the word contains at least one letter or digit. */
    private static boolean hasLetterOrDigit(String word) {
        for (int k = 0; k < word.length(); k++) {
            if (Character.isLetterOrDigit(word.charAt(k))) {
                return true;
            }
        }
        return false;
    }

    /** Tells whether the word contains a sentence terminator ('.', '?' or '!'). */
    private static boolean containsTerminator(String word) {
        return word.indexOf('.') >= 0 || word.indexOf('?') >= 0 || word.indexOf('!') >= 0;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement