Guest User

4chan archiver procedures

a guest
Oct 20th, 2021
380
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
T-SQL 12.56 KB | None | 0 0
  1. USE [4ch]
  2. GO
  3.  
  4. /****** Object:  StoredProcedure [dbo].[process_data]    Script Date: 20.10.2021 18:41:12 ******/
  5. SET ANSI_NULLS ON
  6. GO
  7.  
  8. SET QUOTED_IDENTIFIER ON
  9. GO
  10.  
  11. CREATE PROCEDURE [dbo].[process_data]
  12. @board_short VARCHAR(4)
  13. AS
  14. BEGIN
  15. DECLARE @webpage VARCHAR(MAX)
  16. DECLARE @webpage_len INT
  17. DECLARE @thread_id BIGINT
  18. DECLARE @post_id BIGINT
  19. DECLARE @file_id BIGINT
  20. DECLARE @file_link VARCHAR(255)
  21. DECLARE @file_name VARCHAR(4000)
  22. DECLARE @md5 VARCHAR(50)
  23. DECLARE @post_message VARCHAR(4000)
  24. DECLARE @thread_subject VARCHAR(4000)
  25. DECLARE @quote VARCHAR(4000)
  26. DECLARE @post_table TABLE ([post_id] BIGINT,[thread_id] BIGINT,[file_id] BIGINT,[board_short] VARCHAR(4),[post_message] VARCHAR(4000))
  27. DECLARE @thread_table TABLE ([thread_id] BIGINT,[thread_subject] VARCHAR(4000))
  28. DECLARE @file_table TABLE ([file_id] BIGINT,[file_link] VARCHAR(255),[file_name] VARCHAR(4000),[md5] VARCHAR(50))
  29. SET @webpage = (SELECT [webpage]FROM [4ch].[dbo].[webpage])
  30. SET @webpage_len = LEN (@webpage)
  31. DECLARE @counter_1 INT
  32.  
  33. WHILE (CHARINDEX('div class="thread" id="t',@webpage) <> 0)
  34.     BEGIN
  35.         /*THREAD*/
  36.         SET @webpage = SUBSTRING(@webpage,CHARINDEX('div class="thread" id="t',@webpage)+24,@webpage_len)
  37.         SET @thread_id = REPLACE(SUBSTRING(@webpage,1,CHARINDEX('"',@webpage)),'"','') --THREAD_ID
  38.         WHILE   (CHARINDEX('blockquote class="postMessage" id="m',@webpage) < CHARINDEX('div class="thread" id="t',@webpage)
  39.                 AND CHARINDEX('blockquote class="postMessage" id="m',@webpage) <> 0)
  40.                 OR
  41.                 (CHARINDEX('div class="thread" id="t',@webpage) = 0
  42.                 AND CHARINDEX('blockquote class="postMessage" id="m',@webpage)<>0)         
  43.             BEGIN  
  44.                 /*THREAD SUBJECT*/
  45.                 IF  CHARINDEX('<span class="subject">',@webpage) <> 0
  46.                     AND (
  47.                         CHARINDEX('<span class="subject">',@webpage) < CHARINDEX('blockquote class="postMessage" id="m',@webpage)
  48.                         OR CHARINDEX('blockquote class="postMessage" id="m',@webpage) = 0
  49.                         )
  50.                     BEGIN
  51.                         SET @webpage = SUBSTRING (@webpage,CHARINDEX('<span class="subject">',@webpage),@webpage_len)
  52.                         SET @thread_subject = REPLACE(SUBSTRING(@webpage,23,CHARINDEX('</span>',SUBSTRING(@webpage,23,@webpage_len))),'<','')
  53.                         SET @webpage = SUBSTRING (@webpage,CHARINDEX('</span>',@webpage),@webpage_len)
  54.                     END
  55.                 /*FILE*/
  56.                 IF  CHARINDEX('div class="file" id="f',@webpage) <> 0
  57.                     AND (
  58.                         CHARINDEX('div class="file" id="f',@webpage) < CHARINDEX('blockquote class="postMessage" id="m',@webpage)
  59.                         OR CHARINDEX('blockquote class="postMessage" id="m',@webpage) = 0
  60.                         )
  61.                     BEGIN
  62.                     SET @webpage = SUBSTRING(@webpage,CHARINDEX('div class="file" id="f',@webpage)+22,@webpage_len)
  63.                     SET @file_id = REPLACE(SUBSTRING(@webpage,1,CHARINDEX('"',@webpage)),'"','')
  64.                     IF  @file_id <> 9112225
  65.                         BEGIN
  66.                             IF  CHARINDEX('class="fileDeletedRes retina"',@webpage) <> 0
  67.                                 AND (
  68.                                     CHARINDEX('class="fileDeletedRes retina"',@webpage) < CHARINDEX('<a title="',@webpage)
  69.                                     OR CHARINDEX('<a title="',@webpage) = 0
  70.                                     )
  71.                                 BEGIN
  72.                                     SET @webpage = SUBSTRING(@webpage,CHARINDEX('class="fileDeletedRes retina"',@webpage),@webpage_len)
  73.                                 END
  74.                             ELSE IF CHARINDEX('<a title="',@webpage) <> 0
  75.                                     AND CHARINDEX('<a title="',@webpage) < CHARINDEX('href="//',@webpage)
  76.                                 BEGIN
  77.                                     SET @webpage = SUBSTRING(@webpage,CHARINDEX('<a title="',@webpage)+10,@webpage_len)
  78.                                     SET @file_name = REPLACE(SUBSTRING(@webpage,1,CHARINDEX('"',@webpage)),'"','')
  79.                                     SET @webpage = SUBSTRING(@webpage,CHARINDEX('href="//',@webpage)+8,@webpage_len)
  80.                                     SET @file_link = REPLACE(SUBSTRING(@webpage,1,CHARINDEX('"',@webpage)),'"','')
  81.                                     SET @webpage = SUBSTRING(@webpage,CHARINDEX('data-md5="',@webpage)+10,@webpage_len)
  82.                                     SET @md5 = REPLACE(SUBSTRING(@webpage,1,CHARINDEX('"',@webpage)),'"','')
  83.                                 END
  84.                             ELSE   
  85.                                 BEGIN
  86.                                     SET @webpage = SUBSTRING(@webpage,CHARINDEX('href="//',@webpage)+8,@webpage_len)
  87.                                     SET @file_link = REPLACE(SUBSTRING(@webpage,1,CHARINDEX('"',@webpage)),'"','')
  88.                                     SET @file_name = REPLACE(SUBSTRING(@webpage,CHARINDEX('>',@webpage),CHARINDEX('<',@webpage)-CHARINDEX('>',@webpage)),'>','')
  89.                                     SET @webpage = SUBSTRING(@webpage,CHARINDEX('data-md5="',@webpage)+10,@webpage_len)
  90.                                     SET @md5 = REPLACE(SUBSTRING(@webpage,1,CHARINDEX('"',@webpage)),'"','')
  91.                                 END
  92.                         END
  93.                     END
  94.                 /*THREAD SUBJECT*/
  95.                 IF  CHARINDEX('<span class="subject">',@webpage) <> 0
  96.                     AND (
  97.                         CHARINDEX('<span class="subject">',@webpage) < CHARINDEX('blockquote class="postMessage" id="m',@webpage)
  98.                         OR CHARINDEX('blockquote class="postMessage" id="m',@webpage) = 0
  99.                         )
  100.                     BEGIN
  101.                         SET @webpage = SUBSTRING (@webpage,CHARINDEX('<span class="subject">',@webpage),@webpage_len)
  102.                         SET @thread_subject = REPLACE(SUBSTRING(@webpage,23,CHARINDEX('</span>',SUBSTRING(@webpage,23,@webpage_len))),'<','')
  103.                         SET @webpage = SUBSTRING (@webpage,CHARINDEX('</span>',@webpage),@webpage_len)
  104.                     END
  105.  
  106.                 /*POST*/
  107.                 SET @webpage = SUBSTRING(@webpage,CHARINDEX('blockquote class="postMessage" id="m',@webpage)+36,@webpage_len)
  108.                 SET @post_id = REPLACE(SUBSTRING(@webpage,1,CHARINDEX('"',@webpage)),'"','')
  109.                 SET @post_message = SUBSTRING(@webpage,CHARINDEX('>',@webpage)+1,CHARINDEX('</blockquote>',@webpage)-CHARINDEX('>',@webpage)-1)
  110.  
  111.                 SET @counter_1 = 0
  112.                 WHILE   (CHARINDEX('<a href="',@post_message) <> 0 AND CHARINDEX('</a>',@post_message) <> 0 AND (CHARINDEX('</a>',@post_message)-CHARINDEX('<a href="',@post_message)+4)>0)
  113.                         AND @counter_1 < 20
  114.                         BEGIN
  115.                             SET @quote = SUBSTRING(@post_message,CHARINDEX('<a href="',@post_message),CHARINDEX('</a>',@post_message)-CHARINDEX('<a href="',@post_message)+4)
  116.                             SET @post_message = REPLACE(@post_message,@quote,'')
  117.                             SET @counter_1 = @counter_1 + 1
  118.                         END
  119.        
  120.                 SET @counter_1 = 0
  121.                 WHILE   (CHARINDEX('<span class="deadlink">',@post_message) <> 0 AND CHARINDEX('</span>',@post_message) <> 0 AND (CHARINDEX('</span>',@post_message)-CHARINDEX('<span class="deadlink">',@post_message)+7) > 0)
  122.                         AND @counter_1 < 20
  123.                         BEGIN
  124.                             SET @quote = SUBSTRING(@post_message,CHARINDEX('<span class="deadlink">',@post_message),CHARINDEX('</span>',@post_message)-CHARINDEX('<span class="deadlink">',@post_message)+7)
  125.                             SET @post_message = REPLACE(@post_message,@quote,'')
  126.                             SET @counter_1 = @counter_1 + 1
  127.                         END
  128.  
  129.                 SET @post_message = LTRIM(RTRIM(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@post_message,'<span class="quote">&gt;',' '),'</span>',' '),'<span class="deadlink">>>',' '),'<br>',' '),'&gt;','>'),'&lt;','<'),'&#039;',''''),'<span class="abbr">Comment too long.',' '),'&quot;','"'),'&amp;','&'),'<wbr>',''),'  ',' ')))
  130.                 SET @post_message = CASE WHEN @post_message = '' THEN NULL ELSE @post_message END
  131.            
  132.                 IF  @post_id = @thread_id AND (@thread_subject = '' OR @thread_subject IS NULL)
  133.                     BEGIN
  134.                         SET @thread_subject = @post_message
  135.                     END
  136.  
  137.                 SET @thread_subject = LTRIM(RTRIM(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@thread_subject,'<span class="quote">&gt;',' '),'</span>',' '),'<span class="deadlink">>>',' '),'<br>',' '),'&gt;','>'),'&lt;','<'),'&#039;',''''),'<span class="abbr">Comment too long.',' '),'&quot;','"'),'&amp;','&'),'<wbr>',''),'  ',' ')))
  138.                 SET @thread_subject = CASE WHEN @thread_subject = '' THEN NULL ELSE @thread_subject END
  139.  
  140.                 INSERT INTO @post_table ([post_id],[thread_id],[file_id],[post_message])
  141.                 VALUES (@post_id,@thread_id,@file_id,@post_message)
  142.  
  143.                 DELETE FROM @thread_table WHERE [thread_id] = @thread_id
  144.                 INSERT INTO @thread_table ([thread_id],[thread_subject])
  145.                 VALUES (@thread_id,@thread_subject)
  146.  
  147.                 IF  @file_id IS NOT NULL
  148.                     BEGIN
  149.                         INSERT INTO @file_table ([file_id],[file_link],[file_name],[md5])
  150.                         VALUES (@file_id,@file_link,@file_name,@md5)
  151.                     END
  152.  
  153.                 SET @post_id = NULL
  154.                 SET @file_id = NULL
  155.                 SET @file_link = NULL
  156.                 SET @file_name = NULL
  157.                 SET @md5 = NULL
  158.             END
  159.     END
  160.  
  161.     SET @counter_1 = 1
  162.     DECLARE @max_counter INT
  163.     DECLARE @sentence_current VARCHAR(4000)
  164.     DECLARE @words_table TABLE ([post_message] VARCHAR(4000),[RN] INT,UNIQUE CLUSTERED ([RN]))
  165.     INSERT INTO @words_table ([post_message],[RN])
  166.     SELECT A.[post_message],ROW_NUMBER() OVER (ORDER BY A.[post_message]) AS [RN] FROM @post_table A LEFT JOIN [4ch].[dbo].[post] B ON A.[post_id] = B.[post_id] WHERE B.[post_id] IS NULL AND A.[post_message] <> ''
  167.     SET @max_counter = (SELECT MAX([RN]) FROM @words_table)
  168.     WHILE @counter_1 < @max_counter
  169.         BEGIN
  170.             SET @sentence_current = (SELECT [post_message] FROM @words_table WHERE [RN] = @counter_1)
  171.             EXECUTE [dbo].[words_counter] @sentence = @sentence_current
  172.             SET @counter_1 = @counter_1 + 1
  173.         END
  174.  
  175.     SET @counter_1 = 1
  176.     DECLARE @words_table_2 TABLE ([thread_subject] VARCHAR(4000),[RN] INT,UNIQUE CLUSTERED ([RN]))
  177.     INSERT INTO @words_table_2 ([thread_subject],[RN])
  178.     SELECT A.[thread_subject],ROW_NUMBER() OVER (ORDER BY A.[thread_subject]) AS [RN] FROM @thread_table A LEFT JOIN [4ch].[dbo].[thread] B ON A.[thread_id] = B.[thread_id] WHERE B.[thread_id] IS NULL AND A.[thread_subject] <> ''
  179.     SET @max_counter = (SELECT MAX([RN]) FROM @words_table_2)
  180.     WHILE @counter_1 < @max_counter
  181.         BEGIN
  182.             SET @sentence_current = (SELECT [thread_subject] FROM @words_table_2 WHERE [RN] = @counter_1)
  183.             EXECUTE [dbo].[words_counter] @sentence = @sentence_current
  184.             SET @counter_1 = @counter_1 + 1
  185.         END
  186.  
  187.     INSERT INTO [4ch].[dbo].[post] ([post_id],[thread_id],[file_id],[post_message],[board_short])
  188.     SELECT A.[post_id],A.[thread_id],A.[file_id],A.[post_message],@board_short FROM @post_table A LEFT JOIN [4ch].[dbo].[post] B ON A.[post_id] = B.[post_id] WHERE B.[post_id] IS NULL
  189.  
  190.     INSERT INTO [4ch].[dbo].[thread] ([thread_id],[thread_subject])
  191.     SELECT A.[thread_id],A.[thread_subject] FROM @thread_table A LEFT JOIN [4ch].[dbo].[thread] B ON A.[thread_id] = B.[thread_id] WHERE B.[thread_id] IS NULL
  192.  
  193.     INSERT INTO [4ch].[dbo].[file] ([file_id],[file_link],[file_name],[md5])
  194.     SELECT A.[file_id],A.[file_link],A.[file_name],A.[md5] FROM @file_table A LEFT JOIN [4ch].[dbo].[file] B ON A.[file_id] = B.[file_id] WHERE B.[file_id] IS NULL
  195. END
  196. GO
  197.  
  198.  
  199.  
  200.  
  201.  
  202.  
  203.  
  204.  
  205.  
  206.  
  207. USE [4ch]
  208. GO
  209.  
  210. /****** Object:  StoredProcedure [dbo].[words_counter]    Script Date: 20.10.2021 18:41:22 ******/
  211. SET ANSI_NULLS ON
  212. GO
  213.  
  214. SET QUOTED_IDENTIFIER ON
  215. GO
  216.  
  217. CREATE PROCEDURE [dbo].[words_counter]
  218. @sentence VARCHAR(4000)
  219. AS
  220. BEGIN
  221.     DECLARE @word VARCHAR(255)
  222.     DECLARE @sentence_len INT
  223.     DECLARE @space_index INT
  224.     DECLARE @words_table TABLE ([word] VARCHAR(255),[counter] INT,UNIQUE CLUSTERED ([word],[counter]))
  225.     SET @sentence = LTRIM(RTRIM(@sentence))
  226.     SET @sentence_len = LEN(@sentence)
  227.     SET @space_index = CHARINDEX(' ',@sentence)
  228.     WHILE   @space_index <> 0
  229.             BEGIN
  230.                 SET @word = RTRIM(LEFT(SUBSTRING(@sentence,1,@space_index),255))
  231.                 WHILE @word NOT LIKE '%[0-9A-Ż]' AND @word <> ''
  232.                     BEGIN
  233.                         SET @word = REVERSE(SUBSTRING(REVERSE(@word),2,@sentence_len))
  234.                     END
  235.                 WHILE @word NOT LIKE '[0-9A-Ż]%' AND @word <> ''
  236.                     BEGIN
  237.                         SET @word = SUBSTRING(@word,2,@sentence_len)
  238.                     END
  239.                 SET @sentence = LTRIM(SUBSTRING(@sentence,@space_index,@sentence_len))
  240.                 IF  @word IN (SELECT [word] FROM @words_table WHERE [word] IS NOT NULL) AND @word <> ''
  241.                     BEGIN
  242.                         UPDATE A
  243.                         SET [counter] = [counter] + 1
  244.                         FROM @words_table A
  245.                         WHERE [word] = @word
  246.                     END
  247.                 ELSE IF @word <> ''
  248.                     BEGIN
  249.                         INSERT INTO @words_table ([word],[counter])
  250.                         VALUES (@word,1)
  251.                     END
  252.                 SET @space_index = CHARINDEX(' ',@sentence)
  253.             END
  254.    
  255.     IF  @space_index = 0
  256.         BEGIN
  257.             SET @word = LTRIM(@sentence)
  258.             WHILE @word NOT LIKE '%[0-9A-Ż]' AND @word <> ''
  259.                 BEGIN
  260.                     SET @word = REVERSE(SUBSTRING(REVERSE(@word),2,@sentence_len))
  261.                 END
  262.             WHILE @word NOT LIKE '[0-9A-Ż]%' AND @word <> ''
  263.                 BEGIN
  264.                     SET @word = SUBSTRING(@word,2,@sentence_len)
  265.                 END
  266.             IF  @word IN (SELECT [word] FROM @words_table WHERE [word] IS NOT NULL) AND @word <> ''
  267.                 BEGIN
  268.                     UPDATE A
  269.                     SET [counter] = [counter] + 1
  270.                     FROM @words_table A
  271.                     WHERE [word] = @word
  272.                 END
  273.             ELSE IF @word <> ''
  274.                 BEGIN
  275.                     INSERT INTO @words_table ([word],[counter])
  276.                     VALUES (@word,1)
  277.                 END
  278.         END
  279.  
  280.     UPDATE A
  281.     SET [counter] = A.[counter] + B.[counter]
  282.     FROM [4ch].[dbo].[word] A JOIN @words_table B ON A.word = B.word
  283.  
  284.     INSERT INTO [4ch].[dbo].[word] ([word],[counter])
  285.     SELECT
  286.     A.[word]
  287.     ,A.[counter]
  288.     FROM @words_table A LEFT JOIN [4ch].[dbo].[word] B ON A.[word] = B.[word]
  289.     WHERE
  290.     B.[word] IS NULL
  291. END
  292. GO
  293.  
  294.  
  295.  
Advertisement
Add Comment
Please, Sign In to add comment