Advertisement
Guest User

Untitled

a guest
Apr 27th, 2023
15,905
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 12.30 KB | None | 0 0
  1. <?php
  2. /*
  3.  23.04.2023, Archived.Moe's shitty php downloading script
  4.  Just keep the correct download format like /dataPath/{board}/{first 4 digits}/{next 2 digits}/{full chan_image_name}{?thumb?}{.EXT}
  5.  example:
  6.  chan_image_name = 1574977344130.jpg (for the first 10mil or so rows it's a mess, see script bellow)
  7.  board = b
  8.  image is to be saved in /b/image/1574/97/1574977344130.jpg
  9.  thumb is to be saved in /b/thumb/1574/97/1574977344130s.jpg
  10. */
  11. /* CONFIGS */
  12. $data_path = "/var/www/4archive/data/"; //  /var/www ...
  13. $download_success = false; //stupid way of tracking status
  14. $dbhost = "localhost";
  15. $dbuser = "";
  16. $dbpass = "";
  17. $maindb = "4archive";
  18. /* CONFIGS */
  19.  
  20. // Connect to the database
  21. $conn = mysqli_connect($dbhost, $dbuser, $dbpass, $maindb);
  22. // Check connection
  23. if (!$conn) {
  24.     die("Connection failed: " . mysqli_connect_error());
  25. }
  26.  
  27. /* MAIN */
  28. main();
  29. /* MAIN */
  30.  
  31. echo "\n \t FINISHED DOWNLOADING IMAGES FROM IMGUR ".PHP_EOL;
  32.  
  33. function main()
  34. {
  35.     global $conn, $data_path, $download_success;
  36.     $count = 0;
  37.    
  38.     $sqlThreads = " SELECT * FROM threads LIMIT 300000 OFFSET 30000";
  39.     $resultThreads = mysqli_query($conn, $sqlThreads);
  40.    
  41.     while ($board_row = mysqli_fetch_assoc($resultThreads))
  42.     {
  43.         $board = $board_row['board'];
  44.         $thread_row_id = $board_row['id'];
  45.                    
  46.         // Select data from image_urls table
  47.         $sql = " SELECT id, threads_id, chan_id,original_image_name,chan_image_name,image_url ,downloaded,img_removed  
  48.                            FROM posts
  49.                            WHERE threads_id = $thread_row_id AND
  50.                            image_url LIKE '%imgur%' AND (downloaded IS NULL AND img_removed IS NULL) ";
  51.         $result = mysqli_query($conn, $sql);
  52.      
  53.         // Loop through the data
  54.         while ($row = mysqli_fetch_assoc($result))
  55.         {
  56.             $id = $row['id'];
  57.             $threads_id = $row['threads_id'];
  58.             $original_image_name = $row['original_image_name'];
  59.             $chan_image_name = $row['chan_image_name'];
  60.             $image_url = str_replace( "http:", "https:", $row['image_url']);
  61.             $getUrlExtension = explode('.', $image_url);
  62.             $thumb_url = generateThumbUrl($image_url);
  63.             $downloaded = $row['downloaded'];
  64.             $img_removed = $row['img_removed'];
  65.             //for the first few years of archiving images do not have MD5 value.
  66.             //There will be a lot of duplicate downloads.
  67.             // currently the md5 column isn't indexed. a lookup, even if indexed, would still take a lot of time since it's almost billion posts
  68.             //instead handle the dublicated content issues later - for example when all of the data is saved or when moderating content
  69.             //if whoever ends up using this script or the DB - feel free to index/improve the script.
  70.             //$md5 = $row['md5'];
  71.             // Split the file name into two parts, before and after the dot symbol
  72.             $file_parts = explode('.', $chan_image_name);
  73.             // Check if the first part (before the dot) contains exactly 14 numeric characters, otherwise use the chan_id
  74.             $chan_image_name = (preg_match('/^[0-9]{13}$/', $file_parts[0]))  ?  $row['chan_image_name'] : $row['chan_id'] . "." . end($getUrlExtension);
  75.             $allowed_extensions = array('jpg', 'png', 'gif');
  76.             $extension = substr($chan_image_name, -3);
  77.             if (strlen($chan_image_name) > 4 && !in_array($extension, $allowed_extensions)) {
  78.                 $chan_image_name .= '.' . end($getUrlExtension);
  79.             }
  80.             $thumb_chan_image_name = preg_replace('/[^0-9]/', '', $chan_image_name)  . "s.jpg"; //remove any non-numeric characters, append a "s.jpg"
  81.            
  82.            
  83.             //stupid way to avoid indexing column 'downloaded' and speeding up the checks
  84.             if ($downloaded != 1 && $img_removed != 1)
  85.             {
  86.                 $count ++;
  87.                 echo "File count: $count, DB id: $id ".PHP_EOL;
  88.  
  89.                 // // // Query to select board name from threads table
  90.                 //this way is more time consuming, no? or just my ssd is shit
  91.                 // $sql_board = "SELECT board FROM threads WHERE id='$threads_id'";
  92.                 // $result_board = mysqli_query($conn, $sql_board);
  93.  
  94.                 // // Check if the query returned any result
  95.                 // if (mysqli_num_rows($result_board) > 0) {
  96.                     // $board_row = mysqli_fetch_assoc($result_board);
  97.                     // $board = $board_row['board'];
  98.                 // } else {
  99.                     // echo " \t ??? BOARD NOT FOUND for $sql_board ".PHP_EOL;
  100.                     // $board = "tmp";
  101.                 // }
  102.                
  103.                 $firstPart = substr($chan_image_name, 0, 4);
  104.                 $secondPart = substr($chan_image_name, 4, 3);
  105.                 $dirOriginalImage = $data_path . $board ."/image/". $firstPart ."/". $secondPart;
  106.                 $dirThumbImage = $data_path . $board ."/thumb/". $firstPart ."/". $secondPart;
  107.                 $pathWithOriginalImage = $dirOriginalImage . "/". $chan_image_name;
  108.                 $pathWithThumbImage = $dirThumbImage . "/". $thumb_chan_image_name;
  109.  
  110.                 // //or to download only thumbnails -
  111.                 // if( !file_exists($old_thumb_url) )  
  112.                 // only continue if file is not found on disk
  113.                 if( !file_exists($pathWithOriginalImage) )
  114.                 {
  115.                     // Visit image_url and check for redirection
  116.                     $ch = curl_init();
  117.                     curl_setopt($ch, CURLOPT_URL, $image_url);
  118.                     curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  119.                     curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  120.                     curl_exec($ch);
  121.                     $redirect_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
  122.                     curl_close($ch);
  123.  
  124.                     if (strpos($redirect_url, "removed.png") !== false) {
  125.                         echo " File $image_url was removed ".PHP_EOL;
  126.                         // File is removed, update removed column in posts table
  127.                         $sql_update_removed = "UPDATE posts SET img_removed = '1' WHERE id = '$id' ";
  128.                         mysqli_query($conn, $sql_update_removed);
  129.                     } else {
  130.                         // File is not removed, download it and verify
  131.                          //save big file only if it's not a webm, otherwise save only thumbnail
  132.                         if (substr($original_image_name, -4) === 'webm')
  133.                         {
  134.                             //echo "\t\t\t WEBM! ";
  135.                             saveFile($dirThumbImage, $pathWithThumbImage, $thumb_url);
  136.                         }else{
  137.                             //saveFile($dirOriginalImage, $pathWithOriginalImage, $image_url); //save big image
  138.                             saveFile($dirThumbImage, $pathWithThumbImage, $thumb_url);  // save thumb
  139.                         }
  140.  
  141.                         if ($download_success == true) {
  142.                             // File is downloaded, update downloaded column in posts table
  143.                             $sql_update_downloaded = "UPDATE posts SET downloaded = '1' WHERE id = '$id' " ;
  144.                             //mysqli_query($conn, $sql_update_downloaded);
  145.                         } else {
  146.                             // File download failed
  147.                             echo "Failed to download file: {$image_url}".PHP_EOL;
  148.                             sleep(60);
  149.                         }
  150.                     }
  151.                 }else{
  152.                     echo "File exists - $pathWithOriginalImage ".PHP_EOL;
  153.                     $sql_update_downloaded = "UPDATE posts SET downloaded = '1' WHERE id = '$id' " ;
  154.                     mysqli_query($conn, $sql_update_downloaded);
  155.                 }
  156.             }
  157.         }
  158.     }
  159.  
  160.     // Close the database connection
  161.     mysqli_close($conn);
  162. }
  163.  
  164. function saveFile($dir, $imageWithPath, $fileLink)
  165. {
  166.     global $download_success;
  167.     //echo "Saving $fileLink to $imageWithPath ".PHP_EOL;
  168.     //check first if file exists
  169.     if( file_exists($imageWithPath) )
  170.     {
  171.         echo "File exists - $imageWithPath ".PHP_EOL;
  172.         return $download_success = true;
  173.     }
  174.  
  175.     // check if directory exists, create it otherwise.
  176.     if (!file_exists($dir)) {
  177.         mkdir($dir, 0700, true);
  178.         chmod($dir, 0700);
  179.         // chown($dir, 'www-data');
  180.         // chgrp($dir, 'www-data');
  181.     }
  182.  
  183.     $ch = curl_init();
  184.     curl_setopt($ch, CURLOPT_URL, $fileLink);
  185.     curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  186.     curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  187.  
  188.     $fp = fopen($imageWithPath, 'w');
  189.     curl_setopt($ch, CURLOPT_FILE, $fp);
  190.  
  191.     $result = curl_exec($ch);
  192.  
  193.     curl_close($ch);
  194.     fclose($fp);
  195.  
  196.     if ($result !== false) {
  197.         if( file_exists($imageWithPath) )
  198.         {
  199.             echo "File $fileLink downloaded successfully to $imageWithPath" . PHP_EOL;
  200.             chmod($imageWithPath, '775'); //so that we can delete the file via web interface?
  201.             // chown($imageWithPath, 'www-data'); //otherwise nginx won't see it?
  202.             // chgrp($imageWithPath, 'www-data');
  203.             return $download_success = true;
  204.         }
  205.         else{
  206.             echo " !!ERROR: file was not downloaded - $fileLink was not saved into $imageWithPath !!!!" .PHP_EOL;
  207.             return $download_success = false;
  208.         }
  209.     } else {
  210.         echo " !!ERROR writing to local file - $fileLink into $imageWithPath  " . PHP_EOL;
  211.         $download_success = false;
  212.         return $download_success = false;
  213.     }
  214. }
  215.  
  216. //via file_put_contents - maybe slightly slower?
  217. function saveFile_old($dir, $imageWithPath, $fileLink)
  218. {
  219.     global $download_success;
  220.     //echo "Saving $fileLink to $imageWithPath ".PHP_EOL;
  221.     //check first if file exists
  222.     if( file_exists($imageWithPath) )
  223.     {
  224.         echo "File exists - $imageWithPath ".PHP_EOL;
  225.         return $download_success = true;
  226.     }
  227.  
  228.     // check if directory exists, create it otherwise.
  229.     if (!file_exists($dir)) {
  230.         mkdir($dir, 0700, true);
  231.         chmod($dir, 0700);
  232.         // chown($dir, 'www-data');
  233.         // chgrp($dir, 'www-data');
  234.     }
  235.  
  236.     $fp = fopen($fileLink, 'r'); // Open the remote file for reading
  237.  
  238.     if ($fp !== false) { // Check if the file was opened successfully
  239.         $bytesWritten = file_put_contents($imageWithPath, $fp); // Write the contents of the remote file to the local file
  240.         fclose($fp); // Close the remote file handle
  241.  
  242.         // Check if the local file was written successfully
  243.         if ($bytesWritten !== false)
  244.         {
  245.             if( file_exists($imageWithPath) )
  246.             {
  247.                 //echo "File $fileLink downloaded successfully to $imageWithPath" . PHP_EOL;
  248.                 chmod($imageWithPath, '775'); //so that we can delete the file via web interface?
  249.                 // chown($imageWithPath, 'www-data'); //otherwise nginx won't see it?
  250.                 // chgrp($imageWithPath, 'www-data');
  251.                 return $download_success = true;
  252.             }
  253.             else{
  254.                 echo " !!ERROR: file was not downloaded - $fileLink was not saved into $imageWithPath !!!!" .PHP_EOL;
  255.                 return $download_success = false;
  256.             }
  257.         } else {
  258.             echo " !!ERROR writing to local file - $fileLink into $imageWithPath  " . PHP_EOL;
  259.             $download_success = false;
  260.             return $download_success = false;
  261.         }
  262.     } else {
  263.         echo " !!ERROR opening remote file - $fileLink  " . PHP_EOL;
  264.         $download_success = false;
  265.         return $download_success = false;
  266.     }
  267. }
  268.  
  269. function generateThumbUrl($image_url)
  270. {
  271.     // Find the position of the last dot in the URL
  272.     $lastDotPos = strrpos($image_url, '.');
  273.    
  274.     if (strpos($image_url, 'webm') == true ) {
  275.         //"The string contains 'webm'.";
  276.         $thumb_url = $image_url;
  277.     } else {
  278.         //"The string does not contain 'webm'.";
  279.         // Insert an "m" before the last dot
  280.         if ($lastDotPos !== false)
  281.             $thumb_url = substr_replace($image_url, 'm', $lastDotPos, 0);
  282.         else
  283.             $thumb_url = $image_url;
  284.     }
  285.     return $thumb_url;
  286. }
  287.  
  288.  
  289.  
  290. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement