Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using System.Text.Json;
- using System.Text.Json.Serialization;
- using System.Text.RegularExpressions;
- using System.Web;
- using System.Xml.Serialization;
/// <summary>
/// Runs an asynchronous job over every element of a sequence with bounded parallelism,
/// printing progress to the console as elements complete.
/// </summary>
public static class Scheduler
{
    /// <summary>
    /// Invokes <paramref name="job"/> once for each element of <paramref name="ie"/> using at most
    /// four concurrent workers, and blocks until every element has been handled.
    /// </summary>
    /// <typeparam name="T">Element type of the sequence.</typeparam>
    /// <param name="ie">Items to process. The sequence is enumerated exactly once.</param>
    /// <param name="delay">
    /// Milliseconds a worker sleeps after a job that returned <c>true</c>, or after a job that threw.
    /// </param>
    /// <param name="job">
    /// Job to run per item. Its result tells the scheduler whether the delay should be applied.
    /// Exceptions thrown by the job are reported on the console and do not stop the run.
    /// </param>
    public static void Process<T>(IEnumerable<T> ie, int delay, Func<T, Task<bool>> job)
    {
        const int concurrency = 4;

        // Snapshot the input so a deferred query is evaluated once; otherwise the total used
        // for progress and the items actually processed could come from different enumerations.
        var items = ie.ToList();
        var total = items.Count;
        var count = 0;
        var mutex = new object();

        void ReportProgress(int current)
        {
            lock (mutex)
            {
                // Only print when this report is still the latest one, so stale (lower)
                // values are not printed after a newer one.
                if (current == count)
                {
                    var progress = Math.Round((float)current / total * 100, 2);
                    Console.WriteLine($"{current}/{total}: {progress}%");
                }
            }
        }

        Parallel.ForEach(items, new ParallelOptions { MaxDegreeOfParallelism = concurrency }, x =>
        {
            // Starts as true so that a job which throws is still followed by the delay.
            var didWork = true;
            try
            {
                didWork = job(x).GetAwaiter().GetResult();
            }
            catch (Exception ex)
            {
                // Best effort: keep going, but say what went wrong instead of hiding it.
                Console.WriteLine($"Task failed :( {ex.GetType().Name}: {ex.Message}");
            }
            finally
            {
                ReportProgress(Interlocked.Increment(ref count));
                if (didWork)
                {
                    Thread.Sleep(delay);
                }
            }
        });
    }
}
/// <summary>
/// Site-independent description of a single post: where its file lives, what to call it
/// locally, and which tags it carries.
/// </summary>
public struct Post
{
    /// <summary>Address the post's file is downloaded from.</summary>
    public string URL;

    /// <summary>File name used for the post's file when it is stored.</summary>
    public string Filename;

    /// <summary>Tags attached to the post.</summary>
    public IEnumerable<string> Tags;
}
/// <summary>
/// Common surface of an image-board backend: build a listing URL and fetch the posts behind it.
/// </summary>
public interface IBooru
{
    /// <summary>Builds the request URL for one page of search results.</summary>
    /// <param name="page">Page number to request.</param>
    /// <param name="limit">Number of posts requested per page.</param>
    /// <param name="tags">Tags to search for.</param>
    /// <returns>The URL to request.</returns>
    string ConstructURL(int page, int limit, IEnumerable<string> tags);

    /// <summary>Fetches one page of search results.</summary>
    /// <param name="page">Page number to request.</param>
    /// <param name="limit">Number of posts requested per page.</param>
    /// <param name="tags">Tags to search for.</param>
    /// <returns>The posts found on that page.</returns>
    Task<IEnumerable<Post>> GetPosts(int page, int limit, IEnumerable<string> tags);
}
/// <summary>
/// <see cref="IBooru"/> backend that queries gelbooru.com and parses its XML post listing.
/// </summary>
public class Gelbooru : IBooru
{
    /// <summary>Types mirroring the XML document returned by the listing request.</summary>
    public class DataContract
    {
        /// <summary>A single <c>post</c> element.</summary>
        [XmlRoot(ElementName = "post")]
        public class Post
        {
            /// <summary>Content of the <c>tags</c> element; split on spaces by <c>GetPosts</c>.</summary>
            [XmlElement(ElementName = "tags")]
            public string Tags { get; set; }

            /// <summary>Content of the <c>file_url</c> element.</summary>
            [XmlElement(ElementName = "file_url")]
            public string File_url { get; set; }
        }

        /// <summary>The root <c>posts</c> element.</summary>
        [XmlRoot(ElementName = "posts")]
        public class Posts
        {
            /// <summary>The <c>post</c> child elements; declared nullable, so it may be absent.</summary>
            [XmlElement(ElementName = "post")]
            public List<Post>? Post { get; set; }

            /// <summary>Value of the <c>limit</c> attribute.</summary>
            [XmlAttribute(AttributeName = "limit")]
            public int Limit { get; set; }

            /// <summary>Value of the <c>offset</c> attribute.</summary>
            [XmlAttribute(AttributeName = "offset")]
            public int Offset { get; set; }

            /// <summary>Value of the <c>count</c> attribute.</summary>
            [XmlAttribute(AttributeName = "count")]
            public int Count { get; set; }
        }
    }

    // The client is always supplied by the caller, so no default instance is allocated here.
    private readonly HttpClient _hc;
    private readonly XmlSerializer _xml = new XmlSerializer(typeof(DataContract.Posts));

    /// <summary>Creates a backend that sends its requests through <paramref name="hc"/>.</summary>
    /// <param name="hc">HTTP client used for every request; not disposed by this class.</param>
    public Gelbooru(HttpClient hc)
    {
        _hc = hc;
    }

    /// <summary>Builds the listing URL for one page of search results.</summary>
    /// <param name="page">Page number, sent as the <c>pid</c> query parameter.</param>
    /// <param name="limit">Number of posts requested, sent as <c>limit</c>.</param>
    /// <param name="tags">Tags to search for; joined with spaces and URL-encoded.</param>
    /// <returns>The URL to request.</returns>
    public string ConstructURL(int page, int limit, IEnumerable<string> tags)
    {
        var encodedTags = HttpUtility.UrlEncode(string.Join(" ", tags));
        return $"https://gelbooru.com/index.php?page=dapi&s=post&q=index&limit={limit}&pid={page}&tags={encodedTags}";
    }

    /// <summary>Downloads one page of search results and converts it to <see cref="Post"/> values.</summary>
    /// <param name="page">Page number to request.</param>
    /// <param name="limit">Number of posts requested per page.</param>
    /// <param name="tags">Tags to search for.</param>
    /// <returns>
    /// The posts on that page, already materialised; empty when the response holds no posts.
    /// </returns>
    public async Task<IEnumerable<Post>> GetPosts(int page, int limit, IEnumerable<string> tags)
    {
        using var get = await _hc.GetAsync(ConstructURL(page, limit, tags));
        var content = await get.Content.ReadAsStringAsync();

        using var sr = new StringReader(content);
        var parsed = (DataContract.Posts?)_xml.Deserialize(sr);

        // Both the document and its post list are nullable; treat either as "no results".
        if (parsed?.Post is null)
        {
            return Array.Empty<Post>();
        }

        // Materialise so callers that enumerate more than once do not repeat the projection.
        return parsed.Post
            .Select(x => new Post
            {
                URL = x.File_url,
                Tags = x.Tags?.Split(" ") ?? Array.Empty<string>(),
                // Same rule as the Danbooru backend: last path segment of the file URL.
                Filename = x.File_url?.Split("/").Last()
            })
            .ToArray();
    }
}
/// <summary>
/// <see cref="IBooru"/> backend that queries danbooru.donmai.us and parses its JSON post listing.
/// </summary>
public class Danbooru : IBooru
{
    /// <summary>Types mirroring the JSON returned by the listing request.</summary>
    public class DataContract
    {
        /// <summary>A single element of the JSON array.</summary>
        public class Post
        {
            /// <summary>Value of <c>tag_string</c>; split on spaces by <c>GetPosts</c>.</summary>
            [JsonPropertyName("tag_string")]
            public string Tags { get; set; }

            /// <summary>Value of <c>tag_string_general</c>.</summary>
            [JsonPropertyName("tag_string_general")]
            public string TagsGeneral { get; set; }

            /// <summary>Value of <c>file_url</c>.</summary>
            [JsonPropertyName("file_url")]
            public string File_url { get; set; }
        }
    }

    // The client is always supplied by the caller, so no default instance is allocated here.
    private readonly HttpClient _hc;

    /// <summary>Creates a backend that sends its requests through <paramref name="hc"/>.</summary>
    /// <param name="hc">HTTP client used for every request; not disposed by this class.</param>
    public Danbooru(HttpClient hc)
    {
        _hc = hc;
    }

    /// <summary>Builds the listing URL for one page of search results.</summary>
    /// <param name="page">Page number, sent as the <c>page</c> query parameter.</param>
    /// <param name="limit">Number of posts requested, sent as <c>limit</c>.</param>
    /// <param name="tags">Tags to search for; joined with spaces and URL-encoded.</param>
    /// <returns>The URL to request.</returns>
    public string ConstructURL(int page, int limit, IEnumerable<string> tags)
    {
        var encodedTags = HttpUtility.UrlEncode(string.Join(" ", tags));
        return $"https://danbooru.donmai.us/posts.json?limit={limit}&page={page}&tags={encodedTags}";
    }

    /// <summary>Downloads one page of search results and converts it to <see cref="Post"/> values.</summary>
    /// <param name="page">Page number to request.</param>
    /// <param name="limit">Number of posts requested per page.</param>
    /// <param name="tags">Tags to search for.</param>
    /// <returns>
    /// One <see cref="Post"/> per returned entry, already materialised. Entries without a
    /// <c>file_url</c> are kept (with a null <c>URL</c> and <c>Filename</c>) so the number of
    /// results still reflects what the site returned.
    /// </returns>
    public async Task<IEnumerable<Post>> GetPosts(int page, int limit, IEnumerable<string> tags)
    {
        using var get = await _hc.GetAsync(ConstructURL(page, limit, tags));
        var content = await get.Content.ReadAsStringAsync();

        // Deserialize returns null for a JSON "null" payload; treat that as "no results".
        var entries = JsonSerializer.Deserialize<DataContract.Post[]>(content);
        if (entries is null)
        {
            return Array.Empty<Post>();
        }

        // Materialise so callers that enumerate more than once do not repeat the projection.
        return entries
            .Select(x => new Post
            {
                URL = x.File_url,
                Tags = x.Tags?.Split(" ") ?? Array.Empty<string>(),
                Filename = x.File_url?.Split("/").Last()
            })
            .ToArray();
    }
}
/// <summary>
/// Program entry point: downloads every post matching a fixed tag list into a local folder,
/// plus a set of caption-file maintenance helpers that can be enabled by hand in <c>Main</c>.
/// </summary>
public class Entrypoint
{
    // Image extensions that count as "the image still exists" for a caption file.
    private static readonly string[] ImageExtensions = { ".png", ".bmp", ".gif", ".jpg", ".jpeg" };

    private static readonly HttpClient _hc = new HttpClient();

    /// <summary>
    /// Deletes every <c>*.txt</c> file in <paramref name="directory"/> that has no image with the
    /// same base name next to it.
    /// </summary>
    /// <param name="directory">Folder holding both images and caption files.</param>
    static void RemoveCaptionsForDeleted(string directory)
    {
        foreach (var f in new DirectoryInfo(directory).GetFiles("*.txt"))
        {
            // Two extensions are stripped, so "name.png.txt" and "name.txt" both map to "name".
            var extensionless = Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(f.FullName));
            var exists = ImageExtensions.Any(ext => File.Exists(Path.Combine(directory, extensionless + ext)));
            if (!exists)
            {
                File.Delete(f.FullName);
            }
        }
    }

    /// <summary>
    /// For each processed PNG whose name contains <c>digits-digits-originalName.png</c>, copies
    /// <c>originalName.txt</c> from <paramref name="captionsDir"/> next to the PNG, named after
    /// the PNG. Existing captions are overwritten.
    /// </summary>
    /// <param name="captionsDir">Folder holding the original caption files.</param>
    /// <param name="processedDir">Folder holding the processed PNG files.</param>
    static void CopyCaptionsForProcessed(string captionsDir, string processedDir)
    {
        foreach (var f in new DirectoryInfo(processedDir).GetFiles("*.png"))
        {
            var match = Regex.Match(f.Name, @"\d+-\d+-([^\.]+)\.png");
            if (!match.Success)
            {
                continue;
            }

            var origName = match.Groups[1].Value;
            var origCaptions = Path.Combine(captionsDir, origName + ".txt");
            if (File.Exists(origCaptions))
            {
                var target = Path.Combine(processedDir, Path.GetFileNameWithoutExtension(f.Name) + ".txt");
                File.Copy(origCaptions, target, true);
            }
        }
    }

    /// <summary>
    /// Collects the distinct comma-separated captions of every <c>*.txt</c> file in
    /// <paramref name="captionsDir"/> and writes them, one per line, to <c>captions.txt</c>
    /// in the current directory.
    /// </summary>
    /// <param name="captionsDir">Folder holding the caption files.</param>
    static void DumpUniqueCaptions(string captionsDir)
    {
        var allCaptions = new HashSet<string>();
        foreach (var f in new DirectoryInfo(captionsDir).GetFiles("*.txt"))
        {
            allCaptions.UnionWith(File.ReadAllText(f.FullName).Split(", "));
        }

        File.WriteAllLines("captions.txt", allCaptions);
    }

    /// <summary>
    /// Rewrites every <c>*.txt</c> file in <paramref name="captionsDir"/> so that it only keeps
    /// captions listed in <c>captions.txt</c>, each passed through <see cref="PoorMansTagCleaner"/>.
    /// </summary>
    /// <param name="captionsDir">Folder holding the caption files.</param>
    static void FilterCaptions(string captionsDir)
    {
        var allCaptions = File.ReadAllLines("captions.txt");
        foreach (var f in new DirectoryInfo(captionsDir).GetFiles("*.txt"))
        {
            var captions = File.ReadAllText(f.FullName).Split(", ");
            var filtered = captions
                .Intersect(allCaptions)
                .Select(PoorMansTagCleaner)
                .ToArray();
            File.WriteAllText(f.FullName, string.Join(", ", filtered));
        }
    }

    /// <summary>
    /// Turns underscores into spaces and backslash-escapes parentheses.
    /// </summary>
    /// <param name="tag">Tag to clean.</param>
    /// <returns>The cleaned tag.</returns>
    static string PoorMansTagCleaner(string tag)
    {
        return tag
            .Replace("_", " ")
            // Unescape first so already-escaped parentheses are not escaped a second time.
            .Replace("\\(", "(").Replace("\\)", ")")
            .Replace("(", "\\(").Replace(")", "\\)");
    }

    /// <summary>
    /// Downloads one post's file and writes its tag file, skipping whatever already exists.
    /// </summary>
    /// <param name="post">Post to store.</param>
    /// <param name="outputDir">Folder the image and tag file are written to.</param>
    /// <returns><c>true</c> when the image was downloaded; <c>false</c> when nothing was fetched.</returns>
    private static async Task<bool> DownloadPostAsync(Post post, string outputDir)
    {
        // Posts without a file cannot be stored; skip them explicitly rather than failing
        // inside Path.Combine with a null argument.
        if (string.IsNullOrEmpty(post.URL) || string.IsNullOrEmpty(post.Filename))
        {
            Console.WriteLine("Skipping post without a file URL");
            return false;
        }

        var didWork = false;
        var imagePath = Path.Combine(outputDir, post.Filename);
        if (!File.Exists(imagePath))
        {
            didWork = true;
            var data = await _hc.GetByteArrayAsync(post.URL);
            await File.WriteAllBytesAsync(imagePath, data);
        }

        var tagPath = Path.Combine(outputDir, $"{Path.GetFileNameWithoutExtension(post.Filename)}.txt");
        if (!File.Exists(tagPath))
        {
            File.WriteAllText(tagPath, string.Join(", ", post.Tags.Select(PoorMansTagCleaner)));
        }

        return didWork;
    }

    /// <summary>
    /// Walks the result pages of <paramref name="booru"/> for <paramref name="tags"/>, storing every
    /// post, until a page comes back with fewer posts than were requested.
    /// </summary>
    /// <param name="booru">Backend to query.</param>
    /// <param name="tags">Tags to search for.</param>
    /// <param name="outputDir">Folder the images and tag files are written to.</param>
    private static async Task DownloadAllAsync(IBooru booru, IEnumerable<string> tags, string outputDir)
    {
        const int postsPerPage = 20;
        const int delayMs = 2000;

        for (var page = 0; ; page++)
        {
            Console.WriteLine($"Fetching page {page + 1}");

            // Materialise once: the page is both processed and counted below.
            var posts = (await booru.GetPosts(page, postsPerPage, tags)).ToList();
            Scheduler.Process(posts, delayMs, post => DownloadPostAsync(post, outputDir));

            // A short page means there is nothing after it.
            if (posts.Count != postsPerPage)
            {
                break;
            }

            await Task.Delay(delayMs);
        }
    }

    /// <summary>Creates the output folder and downloads every post matching the tag list.</summary>
    /// <param name="args">Command-line arguments; not used.</param>
    public static void Main(string[] args)
    {
        const string output_dir = "OUT";
        // CreateDirectory does nothing when the folder already exists.
        Directory.CreateDirectory(output_dir);

        //DumpUniqueCaptions(@"X:\1");
        //FilterCaptions(@"X:\1");
        //CopyCaptionsForProcessed(
        //    @"X:\1",
        //    @"X:\2"
        //    );
        //foreach (var t in new DirectoryInfo(@"X:\1").GetFiles("*.txt"))
        //{
        //    var name = Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(t.FullName));
        //    File.Move(t.FullName, Path.Combine(t.Directory.FullName, name + ".txt"));
        //}
        //FilterCaptions(@"X:\1");
        //RemoveCaptionsForDeleted(@"X:\1");
        //return;

        var tags = new[] { "midna", "some_other_tag" };
        IBooru booru = new Danbooru(_hc);

        // Wait on the loop's own task rather than on a separate event: if a page request
        // fails, the exception surfaces here instead of leaving the process waiting forever.
        Task.Run(() => DownloadAllAsync(booru, tags, output_dir)).GetAwaiter().GetResult();
    }
}
Add Comment
Please, Sign In to add comment