Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -- Given table NEWS with textual column FULL_TEXT
- DROP PROCEDURE IF EXISTS populate_counter;
- DELIMITER //
- CREATE PROCEDURE populate_counter ()
- BEGIN
- DROP TABLE IF EXISTS counter;
- CREATE TABLE counter (
- id int unsigned NOT NULL AUTO_INCREMENT,
- word VARCHAR(8000),
- usageCount int(20),
- primary key (id)
- );
- SET @wordCt = 0;
- SET @tokenCt = 1;
- contentLoop: LOOP
- SET @stmt = 'INSERT INTO counter (word) SELECT REPLACE(SUBSTRING(SUBSTRING_INDEX(`full_text`, " ", ?),
- LENGTH(SUBSTRING_INDEX(`full_text`, " ", ? -1)) + 1),
- " ", "") word
- FROM news
- WHERE LENGTH(SUBSTRING_INDEX(`full_text`, " ", ?)) != LENGTH(`full_text`)';
- PREPARE cmd FROM @stmt;
- EXECUTE cmd USING @tokenCt, @tokenCt, @tokenCt;
- SELECT ROW_COUNT() INTO @wordCt;
- DEALLOCATE PREPARE cmd;
- IF (@wordCt = 0) THEN
- LEAVE contentLoop;
- ELSE
- SET @tokenCt = @tokenCt + 1;
- END IF;
- END LOOP;
- END //
- DELIMITER ;
- call populate_counter();
- -- Now clean up COUNTER by removing common words
- DROP PROCEDURE IF EXISTS common_exclude;
- DELIMITER //
- CREATE PROCEDURE common_exclude ()
- BEGIN
- DECLARE cursor_ID INT;
- DECLARE cursor_VAL VARCHAR(100);
- DECLARE done INT DEFAULT 0;
- DECLARE cursor_i CURSOR FOR
- SELECT id,word FROM common_words;
- DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=1;
- OPEN cursor_i;
- contentLoop: LOOP
- FETCH cursor_i INTO cursor_ID, cursor_VAL;
- IF done THEN
- LEAVE contentLoop;
- END IF;
- DELETE FROM counter WHERE word = cursor_VAL;
- END LOOP;
- CLOSE cursor_i;
- END //
- DELIMITER ;
- call common_exclude();
- DELETE FROM counter WHERE CHAR_LENGTH(word) <= 2;
- DELETE FROM counter WHERE word LIKE '%>%' OR word LIKE '%/%' OR word LIKE '%<%';
- DROP TABLE IF EXISTS top_words;
- CREATE TABLE top_words SELECT word, count(*) usageCount FROM counter GROUP BY word ORDER BY usageCount DESC;
- SELECT * FROM top_words;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement