Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
#include "stdafx.h"

#include <cstring>

#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include <boost/tokenizer.hpp>

#include "/.../dirent.h"
#include "boostregex.hpp"

using namespace std;
using namespace boost;
- int main() {
- DIR* dir;
- dirent* pdir;
- dir = opendir("D:/.../dataset/"); // open current directory
- int number_of_words=0;
- char filename[300];
- int i=0;
- while (pdir = readdir(dir))
- {
- string fileString;
- strcpy(filename, "D:/.../dataset/");
- strcat(filename, pdir->d_name);
- ifstream file(filename);
- std::istream_iterator<std::string> beg(file), end;
- number_of_words = distance(beg,end);
- int *wordIndexes = new int[number_of_words +1];
- int index = 0;
- wordIndexes[0] = 0;
- cout<<"Number of words in file: "<<number_of_words<<endl;
- ifstream files(filename);
- if (file.is_open())
- {
- string output;
- while (!files.eof())
- {
- //read word by word
- files >> output;
- fileString += " ";
- fileString += output;
- }
- string fileStringTokenized;
- tokenizer<> tok(fileString);
- for(tokenizer<>::iterator beg=tok.begin(); beg!=tok.end(); ++beg)
- {
- string currentWord;
- currentWord = *beg;
- index += currentWord.size();
- wordIndexes[i] = index;
- i++;
- //cout<<*beg<<"n";
- fileStringTokenized += " ";
- fileStringTokenized += *beg;
- }
- cout<<"Number of characters: "<<fileStringTokenized.size()<<endl;
- const char *charString = fileStringTokenized.c_str();
- //cout<<charString;
- cout<<endl;
- }
- file.close();
- delete []wordIndexes;
- }
- closedir(dir);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement