Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <cctype>
- #include <vector>
- #include <string.h>
- #include <regex.h>
- #include <boost/algorithm/string.hpp>
- #include <boost/regex.hpp>
- #include <algorithm>
- #include <map>
- #include <mysql.h>
- using namespace std;
- using namespace boost;
std::string StripTags(std::string HTML);
// -----------------------------------------------------------------------------
/**
 * Removes markup-like spans from a string.
 *
 * Everything from a '<' up to and including the next '>' is dropped. A '>'
 * that appears outside of any tag is dropped as well, and an unterminated '<'
 * swallows the remainder of the input. No entity decoding is performed and no
 * separator is inserted where a tag used to be.
 *
 * @param HTML  text to clean (taken by value and compacted in place)
 * @return      the text with tag spans and all '>' characters removed
 */
std::string StripTags(std::string HTML)
{
    bool inTag = false;
    std::string::size_type keep = 0; // next write position for a surviving character

    // Single pass: surviving characters are shifted left over the removed
    // ones, so the work is linear instead of erasing one character at a time.
    for (std::string::size_type pos = 0; pos < HTML.size(); ++pos) {
        const char ch = HTML[pos];
        if (ch == '<') {
            inTag = true;
        } else if (ch == '>') {
            // Dropped whether it closes a tag or stands alone.
            inTag = false;
        } else if (!inTag) {
            HTML[keep++] = ch;
        }
    }
    HTML.resize(keep);
    return HTML;
}
// Table that the generated SQL script drops, re-creates and fills with the
// collected phrases.
std::string tableName("keywords");
// -----------------------------------------------------------------------------
// Upper bound on the number of consecutive tokens combined into one phrase.
int iMaxWords(3);
- class oPhraseParser {
- private:
- vector<string> aList;
- vector<string> aPhraseList;
- map<string, int> PhraseData;
- bool is_number(const std::string& s)
- {
- for (int i = 0; i < s.length(); i++) {
- if (!std::isdigit(s[i]) )
- if (s[i] != '.') {
- return false;
- }
- }
- return true;
- }
- bool allowParse(string Word) {
- if (is_number(Word)) {
- return false;
- }
- return true;
- }
- void addPhrase(vector<string>strs, int iNr, int iPlus) {
- int i = 0;
- string phrase = "";
- int iCurrent2, iWordsCount;
- iWordsCount = 0;
- for (i=0; i<=iPlus; i++) {
- iCurrent2 = iNr + i;
- if (strs.size() > iCurrent2) {
- string myWord = strs[iCurrent2];
- trim(myWord);
- if (!is_number(myWord)) {
- phrase = phrase + " " + myWord;
- iWordsCount++;
- }
- }
- }
- trim(phrase);
- if (iWordsCount > 1) {
- if (PhraseData[phrase]) {
- PhraseData[phrase]++;
- }
- else {
- PhraseData[phrase] = 1;
- }
- }
- }
- public:
- void getResult() {
- printf("DROP TABLE IF EXISTS %s;\n", tableName.c_str());
- printf("CREATE TABLE `%s` (`keyword` varchar(255) NOT NULL,`freq` int(11) NOT NULL) ENGINE=MyISAM DEFAULT CHARSET=utf8;\n", tableName.c_str());
- int i = 0;
- for( map<string, int>::iterator ii=PhraseData.begin(); ii!=PhraseData.end(); ++ii)
- {
- if ((*ii).second > 1 && (*ii).first!="") {
- if (i!=0) {
- printf (",\n");
- }
- else {
- printf("INSERT INTO `%s` VALUES ", tableName.c_str());
- }
- i++;
- // char *s1 = (*ii).first;
- // char *s2 = (*ii).second;
- int skaicius = (*ii).second;
- printf("('%s'",(*ii).first.c_str());
- printf(",'%d')",skaicius);
- if (i >= 1000) {
- printf (";\n");
- i = 0;
- }
- }
- }
- if (i != 0) {
- printf (";");
- }
- }
- void add(string text) {
- vector<string> strs;
- int iNr;
- trim(text); //triming whitespaces
- text.erase(std::remove(text.begin(), text.end(), '\n'), text.end()); //remove new line
- text = StripTags(text);
- boost::split(strs, text, boost::is_any_of(" .:;,"));
- for (iNr = 0; iNr < strs.size(); iNr++) {
- if (allowParse(strs[iNr]) == true) {
- int iCurrent = 0;
- while (iCurrent < iMaxWords && (iCurrent+iNr) < strs.size()) {
- addPhrase(strs, iNr, iCurrent);
- iCurrent++;
- }
- }
- }
- }
- };
- int main (int argc, const char * argv[])
- {
- oPhraseParser oParser;
- MYSQL *conn;
- MYSQL_RES *res;
- MYSQL_ROW row;
- char *server = "localhost";
- char *user = "root";
- char *password = "vaidaszilionis";
- char *database = "sphinx";
- conn = mysql_init(NULL);
- if (!mysql_real_connect(conn, server,
- user, password, database, 0, NULL, 0)) {
- fprintf(stderr, "%s\n", mysql_error(conn));
- exit(1);
- }
- if (mysql_query(conn, "Select concat(title, ' ',description,' ',body) as mytext FROM temp limit 100")) {
- fprintf(stderr, "%s\n", mysql_error(conn));
- exit(1);
- }
- res = mysql_use_result(conn);
- //output table name
- while ((row = mysql_fetch_row(res)) != NULL) {
- oParser.add(row[0]);
- }
- mysql_free_result(res);
- mysql_close(conn);
- oParser.getResult();
- return 0;
- }
Add Comment
Please, Sign In to add comment