Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #define MAXN 100000 // assume that maximum number of words is MAXN
- #define ALLCHAR 256 // assume that maximum distinct alphabets is about ALLCHAR
- #define MAXCHAR 256 // assume that maximum length is MAXCHAR/3
- #define TRUE 1
- #define FALSE 0
- #define FILENAME "thaiwords.txt"
- typedef struct{
- int bucket[ALLCHAR], id;
- }Group;
- typedef char bool;
- char word[MAXN][MAXCHAR];
- Group group[MAXN];
- int cmp(const void* aa, const void* bb){
- Group* a = (Group*)aa;
- Group* b = (Group*)bb;
- int i;
- for(i = 0; i < ALLCHAR; ++i)
- if(a->bucket[i] != b->bucket[i]) return a->bucket[i] - b->bucket[i];
- return 0;
- }
- bool is_equal(int a, int b){
- int i;
- for(i = 0; i < ALLCHAR; ++i)
- if(group[a].bucket[i] != group[b].bucket[i]) return FALSE;
- return TRUE;
- }
- int main(){
- int n = 0, i, len;
- FILE* fp = fopen(FILENAME, "r");
- if(!fp) printf("File not found!\n"), exit(1);
- while(fgets(word[n], MAXCHAR,fp)){
- for(i = 0; word[n][i] != 0; ++i){
- if(word[n][i] == ' ') continue;
- group[n].bucket[(unsigned char)word[n][i]]++;
- }
- group[n].id = n;
- n++;
- }
- qsort(group, n, sizeof(group[0]), cmp);
- int allgroup = 0;
- printf("%s", word[group[0].id]); // note that fgets() get '\n' character.
- allgroup = 1;
- for(i = 1; i < n; ++i){
- if(is_equal(i - 1, i)){
- printf("%s", word[group[i].id]);
- }else{
- printf("\n");
- printf("%s", word[group[i].id]);
- allgroup++;
- }
- }
- printf("\nall groups = %d, all words = %d\n", allgroup, n);
- fclose(fp);
- return 0;
- }
- /*
- Complexity:
- O((MAXN * ALLCHAR * log(MAXN)) + (MAXN * MAXCHAR))
- It is good to use this algorithm when ALLCHAR is low, but MAXCHAR is high.
- With suffix tree or string hashing, this bound would be
- O((MAXN * log(MAXN) * log(MAXCHAR)) + (MAXN * MAXCHAR * log(MAXCHAR))).
- It is good to use this algorithm when ALLCHAR is high, but MAXCHAR is low.
- Note:
- From http://www.acc.umu.se/~saasha/charsets/?charset=iso-8859-11&charset=iso-8859-11
- Each Thai character becomes three numbers.
- 'ก' = 224, 184, 129
- 'ข' = 224, 184, 130
- ...
- '๛' = 224, 185, 155
- Luckily, although we separate a single character to three numbers, there will not be the vague case; that is, we don't have to concern about false positive outcome.
- Testcase:
- sadadsasdasddsaasdasdasdads
- สวัสดีabc
- asdadsasdasdasdasdasdasdads
- acสวีbดัส
- abc
- สวัสดีดี
- asdadsasdasddsaasdasdasdads
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement