Guest User

Untitled

a guest
Jul 17th, 2018
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.49 KB | None | 0 0
  #!/usr/bin/perl

  # stripgutenberg.pl < in.txt > out.txt
  #
  # Designed for piping: reads a Project Gutenberg (or Project Gutenberg of
  # Australia) e-text on STDIN / the files named in @ARGV and writes only the
  # body of the text to STDOUT, dropping the boilerplate header and footer.
  #
  # The filter is a line-driven state machine.  Every input line is examined
  # in the context of the current state; a line is echoed only while $print
  # is true, which is the case from the first body line up to (but not
  # including) the footer marker.
  #
  # Written by Andrew Dunbar (hippietrail), released into the public domain, Dec 2010

  use strict;
  use warnings;

  # Set to a true value to trace state transitions and skipped lines on STDERR.
  my $debug = 0;

  # Current state of the header/body/footer state machine.
  my $state = 'beginning';

  # True while input lines should be copied to STDOUT.
  my $print = 0;

  # Emit a debug message on STDERR (never STDOUT, so that tracing cannot
  # corrupt the piped output).  Does nothing unless $debug is set.
  sub debug {
      print STDERR @_ if $debug;
      return;
  }

  # Switch the state machine to $new_state, tracing the transition.
  sub enter_state {
      my ($new_state) = @_;
      debug("state: $state -> $new_state\n");
      $state = $new_state;
      return;
  }

  # Test definedness rather than truth so that a final line consisting of
  # "0" without a trailing newline is still processed.
  while (defined(my $line = <>)) {

      # strip UTF-8 BOM
      $line =~ s/\A\xef\xbb\xbf// if $. == 1;

      # Match against a copy without the line terminator so that files with
      # CRLF line endings are recognised exactly like LF-only files.  The
      # original $line (terminator included) is what gets printed.
      (my $text = $line) =~ s/\r?\n\z//;

      if ($state eq 'beginning') {
          if ($text =~ /^(?:The Project Gutenberg [Ee]Book(?: of|,)|Project Gutenberg's )/) {
              enter_state('normal pg header');
              $print = 0;
          }
          elsif ($text eq '') {
              enter_state('beginning blanks');
          }
          else {
              die "unrecognized beginning: $line";
          }
      }
      elsif ($state eq 'normal pg header') {
          # Everything up to the START marker is header body and is skipped.
          if ($text =~ /^\*\*\* ?START OF TH(?:IS|E) PROJECT GUTENBERG EBOOK,? /) {
              enter_state('end of normal header');
          }
      }
      elsif ($state eq 'end of normal header') {
          if ($text =~ /^(?:Produced by|Transcribed from)/) {
              enter_state('post header');
          }
          elsif ($text eq '') {
              # blank lines
          }
          else {
              enter_state('etext body');
              $print = 1;
          }
      }
      elsif ($state eq 'post header') {
          # A "Produced by" / "Transcribed from" credit may span several
          # lines; it ends at the first blank line.
          enter_state('blanks after post header') if $text eq '';
      }
      elsif ($state eq 'blanks after post header') {
          if ($text ne '') {
              enter_state('etext body');
              $print = 1;
          }
      }
      elsif ($state eq 'beginning blanks') {
          if ($text =~ m{<!-- #INCLUDE virtual="/include/ga-books-texth\.html" -->}) {
              enter_state('header include');
          }
          elsif ($text =~ /^Title: /) {
              enter_state('aus header');
          }
          elsif ($text eq '') {
              # more blanks
          }
          else {
              die "unexpected stuff after beginning blanks: $line";
          }
      }
      elsif ($state eq 'header include') {
          # Skip blank lines after the include; the first non-blank line
          # starts the Australian header.
          enter_state('aus header') if $text ne '';
      }
      elsif ($state eq 'aus header') {
          # Either of these two lines closes the Australian header.
          if (   $text =~ m{^To contact Project Gutenberg of Australia go to http://gutenberg\.net\.au$}
              || $text =~ /^A Project Gutenberg of Australia eBook$/)
          {
              enter_state('end of aus header');
          }
      }
      elsif ($state eq 'end of aus header') {
          # Title lines, Author lines and blank lines still belong to the
          # header; anything else is the first line of the body.
          if ($text !~ /^(?:(?:Title|Author): .*)?$/) {
              enter_state('etext body');
              $print = 1;
          }
      }
      elsif ($state eq 'etext body') {
          # here's the stuff -- until one of the footer markers shows up
          if (   $text =~ m{^<!-- #INCLUDE virtual="/include/ga-books-textf\.html" -->$}
              || $text =~ /^(?:\*\*\* ?)?end of (?:the )?project/i)
          {
              enter_state('footer');
              $print = 0;
          }
      }
      elsif ($state eq 'footer') {
          # nothing more of interest
      }
      else {
          die "unknown state '$state'";
      }

      if ($print) {
          print $line;
      }
      else {
          debug("## $line");
      }
  }
Add Comment
Please, Sign In to add comment