Guest User

md2bb.pl

a guest
Sep 19th, 2017
23
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/perl
  2.  
  3. #
  4. # Markdown -- A text-to-HTML conversion tool for web writers
  5. #
  6. # Copyright (c) 2004 John Gruber
  7. # <http://daringfireball.net/projects/markdown/>
  8. #
  9.  
  10.  
  11. package Markdown2bbcode;
  12. require 5.006_000;
  13. use strict;
  14. use warnings;
  15.  
  16. use Digest::MD5 qw(md5_hex);
  17. use vars qw($VERSION);
  18. $VERSION = '1.0.1';
  19. # Tue 14 Dec 2004
  20.  
  21. ## Disabled; causes problems under Perl 5.6.1:
  22. # use utf8;
  23. # binmode( STDOUT, ":utf8" );  # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
  24.  
  25. #
  26. # Global default settings:
  27. #
  # Number of columns that make up one indentation level.
  my $g_tab_width = 4;

  #
  # File-scope state shared by the conversion routines:
  #

  # Pattern matching the text between a balanced pair of [brackets]
  # (technique from Friedl, "Mastering Regular Expressions", 2nd Ed.,
  # pp. 328-331). The variable is declared on its own line because the
  # pattern refers back to itself through the (??{ ... }) construct.
  my $g_nested_brackets;
  $g_nested_brackets = qr{
      (?>                                 # Atomic matching
         [^\[\]]+                         # Anything other than brackets
       |
         \[
           (??{ $g_nested_brackets })     # Recursive set of nested brackets
         \]
      )*
  }x;

  # Maps every character that can be backslash-escaped to its MD5 hex
  # digest, which is used as a placeholder while the text is processed.
  my %g_escape_table =
      map { ( $_ => md5_hex($_) ) } split //, '\\`*_{}[]()>#+-.!';

  # Per-document lookup tables filled in by the utility routines.
  my (%g_urls, %g_titles, %g_html_blocks);

  # Non-zero while a list is being processed
  # (see _ProcessListItems() for details).
  my $g_list_level = 0;
  63.  
  # Always reports success.
  # NOTE(review): presumably the plug-in "start" hook of a host
  # application (story() below has the matching shape) -- confirm.
  sub start {
      return 1;
  }
  # Converts the text behind $body_ref in place with Markdown(), but only
  # when $meta::markup is defined and equals "markdown" (case-insensitive,
  # surrounding whitespace ignored). The other arguments are accepted and
  # not used. Always returns 1.
  sub story {
      my ($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;

      my $wants_markdown = defined($meta::markup)
                        && $meta::markup =~ /^\s*markdown\s*$/i;

      $$body_ref = Markdown($$body_ref) if $wants_markdown;

      return 1;
  }
  74.  
  75.  
  76. #############################################################################
  no warnings 'once';
  use warnings;

  #### Check for command-line switches: #################
  my %cli_opts;
  use Getopt::Long;
  Getopt::Long::Configure('pass_through');
  GetOptions(\%cli_opts,
      'version',
      'shortversion',
      'html4tags',
  );
  if ($cli_opts{'version'}) {     # Version info
      print "\nThis is Markdown, version $VERSION.\n";
      print "Copyright 2004 John Gruber\n";
      print "http://daringfireball.net/projects/markdown/\n\n";
      exit 0;
  }
  if ($cli_opts{'shortversion'}) {        # Just the version number string.
      print $VERSION;
      exit 0;
  }

  #### Process incoming text: ###########################
  my $text;
  {
      local $/;               # Slurp the whole file
      $text = <>;
  }
  $text = '' unless defined $text;    # no input at all

  #### Type the converted text into the focused window: #
  #
  # The target window is looked up once, so the whole document goes to the
  # window that had focus when the script started, even if focus moves
  # while typing is in progress.
  my $win = qx#xdotool getwindowfocus#;
  $win = '' unless defined $win;
  chomp $win;
  die "md2bb: could not determine the focused window (is xdotool installed?)\n"
      unless $win =~ /\A\d+\z/;

  # Every xdotool call uses the list form of system(), so no shell is
  # involved: the text needs no escaping and cannot be interpreted as
  # shell syntax.
  my @press_return = ('xdotool', 'windowactivate', '--sync', $win, 'key', 'Return');

  foreach my $line (split /^/, Markdown($text)) {
      chomp $line;

      # A blank line needs nothing typed; the Return below is enough.
      if ($line ne '') {
          # Lines beginning with "--" are wrapped in [code] tags, as before.
          my $typed = (substr($line, 0, 2) eq '--')
                    ? join('', '[code]', $line, '[/code]')
                    : $line;

          # The "--" argument ends option parsing, so text that starts
          # with a dash is typed rather than read as an xdotool option.
          system('xdotool', 'type', '--delay', '0', '--window', $win, '--', $typed) == 0
              or warn "md2bb: 'xdotool type' failed (status $?)\n";
      }

      system(@press_return) == 0
          or warn "md2bb: 'xdotool key Return' failed (status $?)\n";
  }
  130. ##############################################################################
  131.  
  132.  
  sub Markdown {
  #
  # Main entry point: takes Markdown source text and returns the
  # converted text. The stages below depend on each other's output, so
  # their order must not be changed.
  #
      my ($source) = @_;

      # Forget everything remembered from a previous document; otherwise
      # link definitions and stashed blocks from one article would leak
      # into the next when several are converted in one run.
      %g_urls        = ();
      %g_titles      = ();
      %g_html_blocks = ();

      # Normalise line endings to "\n": DOS first, then old Mac.
      for ($source) {
          s{\r\n}{\n}g;
          s{\r}{\n}g;
      }

      # Guarantee a blank line at the very end.
      $source .= "\n\n";

      # Tabs become spaces.
      $source = _Detab($source);

      # Empty out lines that hold nothing but spaces and tabs, so that
      # later patterns can recognise blank lines with a plain /\n+/.
      $source =~ s/^[ \t]+$//mg;

      # 1. stash block-level HTML, 2. collect and remove link definitions,
      # 3. run the block-level conversions, 4. restore escaped characters.
      foreach my $stage (\&_HashHTMLBlocks,
                         \&_StripLinkDefinitions,
                         \&_RunBlockGamut,
                         \&_UnescapeSpecialChars) {
          $source = $stage->($source);
      }

      return $source . "\n";
  }
  179.  
  180.  
  sub _StripLinkDefinitions {
  #
  # Removes link definitions of the form
  #
  #     [id]: url "optional title"
  #
  # from the text and records them in %g_urls and %g_titles, keyed by the
  # lower-cased id (link ids are case-insensitive). Returns the text
  # without the definitions.
  #
      my $text = shift;
      my $less_than_tab = $g_tab_width - 1;

      # Each successful substitution deletes one definition; the loop ends
      # when none are left.
      while ($text =~ s{
                          ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
                            [ \t]*
                            \n?               # maybe *one* newline
                            [ \t]*
                          <?(\S+?)>?          # url = $2
                            [ \t]*
                            \n?               # maybe one newline
                            [ \t]*
                          (?:
                              (?<=\s)         # lookbehind for whitespace
                              ["(]
                              (.+?)           # title = $3
                              [")]
                              [ \t]*
                          )?  # title is optional
                          (?:\n+|\Z)
                      }
                      {}mx) {
          # Copy the captures straight away so that nothing executed below
          # can disturb them.
          my ($id, $url, $title) = (lc $1, $2, $3);

          $g_urls{$id} = _EncodeAmpsAndAngles($url);

          # Test for definedness rather than truth: a title consisting of
          # the single character "0" is a valid title.
          if (defined $title) {
              $title =~ s/"/&quot;/g;
              $g_titles{$id} = $title;
          }
      }

      return $text;
  }
  218.  
  219.  
  sub _HashHTMLBlocks {
  #
  # Replaces every block-level HTML element found in the text with the
  # MD5 hex digest of that block (surrounded by blank lines) and stores
  # the original block in %g_html_blocks under that digest. Returns the
  # modified text.
  #
      my $text = shift;
      my $less_than_tab = $g_tab_width - 1;

      # Shared replacement step for all four patterns below: remember the
      # block, hand back its placeholder.
      my $stash = sub {
          my ($block) = @_;
          my $key = md5_hex($block);
          $g_html_blocks{$key} = $block;
          return "\n\n" . $key . "\n\n";
      };

      # Only block-level tags are stashed; text wrapped in span-level tags
      # (anchors, emphasis, spans) must remain visible to the paragraph
      # logic. The tag lists are hard-coded:
      my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
      my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;

      # Pass 1: nested blocks such as
      #
      #   <div>
      #       <div>
      #       tags for inner block must be indented.
      #       </div>
      #   </div>
      #
      # The outer tags have to sit at the left margin and the inner ones
      # have to be indented. This pass must come before the more liberal
      # pass 2, which would otherwise stop at the first closing tag.
      $text =~ s{
                  (                       # save in $1
                      ^                   # start of line  (with /m)
                      <($block_tags_a)    # start tag = $2
                      \b                  # word break
                      (.*\n)*?            # any number of lines, minimally matching
                      </\2>               # the matching end tag
                      [ \t]*              # trailing spaces/tabs
                      (?=\n+|\Z)  # followed by a newline or end of document
                  )
              }{
                  $stash->($1);
              }egmx;

      # Pass 2: anything from `\n<tag>` up to `</tag>\n`.
      $text =~ s{
                  (                       # save in $1
                      ^                   # start of line  (with /m)
                      <($block_tags_b)    # start tag = $2
                      \b                  # word break
                      (.*\n)*?            # any number of lines, minimally matching
                      .*</\2>             # the matching end tag
                      [ \t]*              # trailing spaces/tabs
                      (?=\n+|\Z)  # followed by a newline or end of document
                  )
              }{
                  $stash->($1);
              }egmx;

      # Pass 3: <hr />, handled separately to keep the patterns above
      # simple.
      $text =~ s{
                  (?:
                      (?<=\n\n)       # Starting after a blank line
                      |               # or
                      \A\n?           # the beginning of the doc
                  )
                  (                       # save in $1
                      [ ]{0,$less_than_tab}
                      <(hr)               # start tag = $2
                      \b                  # word break
                      ([^<>])*?           #
                      /?>                 # the matching end tag
                      [ \t]*
                      (?=\n{2,}|\Z)       # followed by a blank line or end of document
                  )
              }{
                  $stash->($1);
              }egx;

      # Pass 4: HTML comments standing on their own.
      $text =~ s{
                  (?:
                      (?<=\n\n)       # Starting after a blank line
                      |               # or
                      \A\n?           # the beginning of the doc
                  )
                  (                       # save in $1
                      [ ]{0,$less_than_tab}
                      (?s:
                          <!
                          (--.*?--\s*)+
                          >
                      )
                      [ \t]*
                      (?=\n{2,}|\Z)       # followed by a blank line or end of document
                  )
              }{
                  $stash->($1);
              }egx;

      return $text;
  }
  328.  
  329.  
  sub _RunBlockGamut {
  #
  # Applies, in a fixed order, every transformation that produces
  # block-level output: headers, lists, code blocks, block quotes and
  # finally paragraphs. Returns the transformed text.
  #
      my ($text) = @_;

      # _HashHTMLBlocks() already ran once in Markdown() to protect raw
      # HTML from the source. It runs again here, just before paragraphs
      # are formed, so that block-level markup generated by the earlier
      # stages is protected as well.
      my @stages = (
          \&_DoHeaders,
          \&_DoLists,
          \&_DoCodeBlocks,
          \&_DoBlockQuotes,
          \&_HashHTMLBlocks,
          \&_FormParagraphs,
      );

      foreach my $stage (@stages) {
          $text = $stage->($text);
      }

      return $text;
  }
  355.  
  356.  
  sub _RunSpanGamut {
  #
  # Applies, in a fixed order, every transformation that works *inside*
  # a block (paragraph, header, list item). Returns the transformed text.
  #
      my ($text) = @_;

      # Ordering constraints:
      #   * images come before anchors, because ![foo][f] also looks like
      #     an anchor;
      #   * auto-links (`<http://example.com/>`) come after anchors,
      #     because inline links may use < and > as delimiters, as in
      #     [this](<url>).
      my @stages = (
          \&_DoCodeSpans,
          \&_EscapeSpecialChars,
          \&_DoImages,
          \&_DoAnchors,
          \&_DoAutoLinks,
          \&_EncodeAmpsAndAngles,
          \&_DoItalicsAndBold,
      );

      foreach my $stage (@stages) {
          $text = $stage->($text);
      }

      # Hard breaks: two or more spaces before a newline collapse to one.
      $text =~ s/ {2,}\n/ \n/g;

      return $text;
  }
  387.  
  388.  
  sub _EscapeSpecialChars {
  #
  # Splits the text into tokens with _TokenizeHTML() and rebuilds it:
  #
  #   * inside "tag" tokens, * and _ are replaced by their entries in
  #     %g_escape_table so that they cannot be mistaken for emphasis
  #     markers later on (the MD5 placeholders make an accidental clash
  #     with real text very unlikely);
  #   * every other token is passed through _EncodeBackslashEscapes().
  #
  # Returns the rebuilt text.
  #
      my $text   = shift;
      my $tokens = _TokenizeHTML($text);

      my @pieces;
      foreach my $token (@$tokens) {
          my ($type, $content) = @$token;

          if ($type eq "tag") {
              $content =~ s! \* !$g_escape_table{'*'}!gx;
              $content =~ s! _  !$g_escape_table{'_'}!gx;
          }
          else {
              $content = _EncodeBackslashEscapes($content);
          }

          push @pieces, $content;
      }

      return join '', @pieces;
  }
  416.  
  417.  
  sub _DoAnchors {
  #
  # Turns Markdown links into BBCode: [url="address"]link text[/url].
  # Returns the converted text.
  #
      my $text = shift;

      # Builds the BBCode for one link. Any * or _ in the address is
      # replaced by its %g_escape_table entry so that it is not taken
      # for an emphasis marker later.
      my $bbcode = sub {
          my ($label, $url) = @_;
          $url =~ s! \* !$g_escape_table{'*'}!gx;
          $url =~ s!  _ !$g_escape_table{'_'}!gx;
          return '[url="' . $url . '"]' . $label . '[/url]';
      };

      #
      # Reference-style links: [link text] [id]
      #
      $text =~ s{
          (                   # wrap whole match in $1
            \[
              ($g_nested_brackets)    # link text = $2
            \]
            [ ]?              # one optional space
            (?:\n[ ]*)?       # one optional newline followed by spaces
            \[
              (.*?)       # id = $3
            \]
          )
      }{
          my ($whole_match, $link_text, $link_id) = ($1, $2, lc $3);

          # An empty id, as in [this][], means the link text is the id.
          $link_id = lc $link_text if $link_id eq "";

          # Unknown ids leave the source text exactly as it was.
          defined $g_urls{$link_id}
              ? $bbcode->($link_text, $g_urls{$link_id})
              : $whole_match;
      }xsge;

      #
      # Inline links: [link text](url "optional title")
      # The title is matched so that it is consumed, but it does not
      # appear in the output.
      #
      $text =~ s{
          (               # wrap whole match in $1
            \[
              ($g_nested_brackets)    # link text = $2
            \]
            \(            # literal paren
              [ \t]*
              <?(.*?)>?   # href = $3
              [ \t]*
              (           # $4
                (['"])    # quote char = $5
                (.*?)     # Title = $6
                \5        # matching quote
              )?          # title is optional
            \)
          )
      }{
          my ($link_text, $url) = ($2, $3);
          $bbcode->($link_text, $url);
      }xsge;

      return $text;
  }
  490.  
  491.  
  sub _DoImages {
  #
  # Turns Markdown images into BBCode: [img="alt text"]address[/img].
  # Returns the converted text.
  #
      my $text = shift;

      # Builds the BBCode for one image. Any * or _ in the address is
      # replaced by its %g_escape_table entry so that it is not taken
      # for an emphasis marker later.
      my $bbcode = sub {
          my ($alt, $url) = @_;
          $url =~ s! \* !$g_escape_table{'*'}!gx;
          $url =~ s!  _ !$g_escape_table{'_'}!gx;
          return '[img="' . $alt . '"]' . $url . '[/img]';
      };

      #
      # Reference-style images: ![alt text][id]
      #
      $text =~ s{
          (               # wrap whole match in $1
            !\[
              (.*?)       # alt text = $2
            \]
            [ ]?              # one optional space
            (?:\n[ ]*)?       # one optional newline followed by spaces
            \[
              (.*?)       # id = $3
            \]
          )
      }{
          my ($whole_match, $alt_text, $link_id) = ($1, $2, lc $3);

          # An empty id, as in ![this][], means the alt text is the id.
          # The id is derived before the alt text has its quotes encoded.
          $link_id = lc $alt_text if $link_id eq "";
          $alt_text =~ s/"/&quot;/g;

          # Unknown ids leave the source text exactly as it was.
          defined $g_urls{$link_id}
              ? $bbcode->($alt_text, $g_urls{$link_id})
              : $whole_match;
      }xsge;

      #
      # Inline images: ![alt text](url "optional title")
      # The title is matched so that it is consumed, but it does not
      # appear in the output.
      #
      $text =~ s{
          (               # wrap whole match in $1
            !\[
              (.*?)       # alt text = $2
            \]
            \(            # literal paren
              [ \t]*
              <?(\S+?)>?  # src url = $3
              [ \t]*
              (           # $4
                (['"])    # quote char = $5
                (.*?)     # title = $6
                \5        # matching quote
                [ \t]*
              )?          # title is optional
            \)
          )
      }{
          my ($alt_text, $url) = ($2, $3);
          $alt_text =~ s/"/&quot;/g;
          $bbcode->($alt_text, $url);
      }xsge;

      return $text;
  }
  569.  
  570.  
  sub _DoHeaders {
  #
  # Converts Markdown headers to BBCode. Every header level produces the
  # same output: [h]header text[/h] followed by a blank line. The header
  # text itself is run through _RunSpanGamut(). Returns the converted
  # text.
  #
      my $text = shift;

      # Shared output step for all three header syntaxes.
      my $heading = sub {
          my ($title) = @_;
          return "[h]" . _RunSpanGamut($title) . "[/h]\n\n";
      };

      # Setext-style headers, underlined with = ...
      #
      #     Header 1
      #     ========
      #
      $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
          $heading->($1);
      }egmx;

      # ... or underlined with - (done as a second, separate pass):
      #
      #     Header 2
      #     --------
      #
      $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
          $heading->($1);
      }egmx;

      # atx-style headers, one to six leading # characters, with optional
      # closing # characters that are discarded:
      #
      #     # Header 1
      #     ## Header 2 with closing hashes ##
      #     ###### Header 6
      #
      $text =~ s{
              ^(\#{1,6})  # $1 = string of #'s
              [ \t]*
              (.+?)       # $2 = Header text
              [ \t]*
              \#*         # optional closing #'s (not counted)
              \n+
          }{
              $heading->($2);
          }egmx;

      return $text;
  }
  610.  
  611.  
  sub _DoLists {
  #
  # Converts Markdown bulleted and numbered lists to BBCode:
  #
  #     [list]   ... [/list]    for bulleted lists (markers *, + or -)
  #     [list=1] ... [/list]    for numbered lists (markers 1., 2., ...)
  #
  # The items themselves are produced by _ProcessListItems(). Nested
  # lists use exactly the same tags as top-level lists; only the rule
  # for where a list may begin differs. Returns the converted text.
  #
      my $text = shift;
      my $less_than_tab = $g_tab_width - 1;

      # Re-usable patterns to match list item bullets and number markers:
      my $marker_ul  = qr/[*+-]/;
      my $marker_ol  = qr/\d+[.]/;
      my $marker_any = qr/(?:$marker_ul|$marker_ol)/;

      # Re-usable pattern to match one entire bulleted or numbered list:
      my $whole_list = qr{
          (                               # $1 = whole list
            (                             # $2
              [ ]{0,$less_than_tab}
              (${marker_any})             # $3 = first list item marker
              [ \t]+
            )
            (?s:.+?)
            (                             # $4
                \z
              |
                \n{2,}
                (?=\S)
                (?!                       # Negative lookahead for another list item marker
                  [ \t]*
                  ${marker_any}[ \t]+
                )
            )
          )
      }mx;

      # Turns one matched list into BBCode. The type of the first marker
      # decides the type of the whole list.
      my $render = sub {
          my ($list, $first_marker) = @_;
          my $list_type = ($first_marker =~ m/$marker_ul/) ? "list" : "list=1";

          # Turn double returns into triple returns, so that the last item
          # can still be recognised as a paragraph-style item if necessary.
          $list =~ s/\n{2,}/\n\n\n/g;

          return "[$list_type]\n"
               . _ProcessListItems($list, $marker_any)
               . "[/list]\n";
      };

      # A nested list may start at the beginning of any line; a top-level
      # list must follow a blank line or open the document (see the long
      # comment in _ProcessListItems() for the reason).
      #
      # Two literal s/// statements are kept on purpose instead of a single
      # one with a pattern chosen at run time: the upstream author reported
      # that the single-pattern form caused rebuilds to be killed when run
      # under Movable Type on one hosting setup, and that this form did not.
      if ($g_list_level) {
          $text =~ s{
                  ^
                  $whole_list
              }{
                  $render->($1, $3);
              }egmx;
      }
      else {
          $text =~ s{
                  (?:(?<=\n\n)|\A\n?)
                  $whole_list
              }{
                  $render->($1, $3);
              }egmx;
      }

      return $text;
  }
  698.  
  699.  
  sub _ProcessListItems {
  #
  # Splits the text of one list into its items and wraps each item as
  # "[\*]" . item . "[/\*]" followed by a newline (the backslashes are
  # part of the emitted text).
  #
  # Arguments: the text of the list, and the compiled pattern that
  # matches any list marker. Returns the converted list text.
  #
      my ($list_str, $marker_any) = @_;

      # $g_list_level counts how deep inside lists we are: it goes up on
      # entry and down on exit, and is zero outside any list.
      #
      # _DoLists() needs it because of text like
      #
      #       I recommend upgrading to version
      #       8. Oops, now this line is treated
      #       as a sub-list.
      #
      # Outside a list this has to stay a single paragraph, although its
      # second line starts with digit-period-space. Inside a list the very
      # same line does start a sub-list. The syntax cannot be parsed
      # perfectly here; requiring sub-lists to start with "1." or "a."
      # would be one way to remove the ambiguity.
      $g_list_level++;

      # trim trailing blank lines:
      $list_str =~ s/\n{2,}\z/\n/;

      $list_str =~ s{
          (\n)?                           # leading line = $1
          (^[ \t]*)                       # leading whitespace = $2
          ($marker_any) [ \t]+            # list marker = $3
          ((?s:.+?)                       # list item text   = $4
          (\n{1,2}))
          (?= \n* (\z | \2 ($marker_any) [ \t]+))
      }{
          my ($blank_before, $body) = ($1, $4);

          if ($blank_before or $body =~ m/\n{2,}/) {
              # Item set off by blank lines: full block-level treatment.
              $body = _RunBlockGamut(_Outdent($body));
          }
          else {
              # Compact item: look for sub-lists, then span-level markup.
              $body = _DoLists(_Outdent($body));
              chomp $body;
              $body = _RunSpanGamut($body);
          }

          "[\\*]" . $body . "[/\\*]\n";
      }egmx;

      $g_list_level--;
      return $list_str;
  }
  763.  
  764.  
  765.  
  sub _DoCodeBlocks {
  #
  # Converts indented Markdown code blocks to BBCode [code] ... [/code]
  # blocks, each set off by blank lines. Returns the converted text.
  #
      my ($text) = @_;

      $text =~ s{
              (?:\n\n|\A)
              (               # $1 = the code block -- one or more lines, starting with a space/tab
                (?:
                  (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
                  .*\n+
                )+
              )
              ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
          }{
              my $code = $1;

              # Remove one level of indentation and neutralise characters
              # that would otherwise be treated as Markdown, then expand
              # any remaining tabs.
              $code = _EncodeCode(_Outdent($code));
              $code = _Detab($code);

              $code =~ s/\A\n+//;     # no blank lines at the top
              $code =~ s/\s+\z//;     # no whitespace at the bottom

              "\n\n[code]" . $code . "\n[/code]\n\n";
          }egmx;

      return $text;
  }
  795.  
  796.  
  sub _DoCodeSpans {
  #
  # Converts backtick-quoted code spans to BBCode [code]...[/code].
  #
  #   *   The delimiter may be a run of several backticks, which makes it
  #       possible to have literal backticks inside the span:
  #
  #         Just type ``foo `bar` baz`` at the prompt.
  #
  #       yields a span containing
  #
  #         foo `bar` baz
  #
  #       The run can be as long as needed: for three consecutive
  #       backticks in the code, delimit with four, and so on.
  #
  #   *   Spaces and tabs next to the delimiters are dropped, so a span
  #       can begin or end with a literal backtick:
  #
  #         ... type `` `bar` `` ...
  #
  #       yields a span containing
  #
  #         `bar`
  #
  # The span contents are passed through _EncodeCode(). Returns the
  # converted text.
  #
      my ($text) = @_;

      $text =~ s{
              (`+)        # $1 = Opening run of `
              (.+?)       # $2 = The code block
              (?<!`)
              \1          # Matching closer
              (?!`)
          }{
              my $code = "$2";
              for ($code) {
                  s/^[ \t]*//g; # leading whitespace
                  s/[ \t]*$//g; # trailing whitespace
              }
              '[code]' . _EncodeCode($code) . '[/code]';
          }egsx;

      return $text;
  }
  841.  
  842.  
  sub _EncodeCode {
  #
  # Makes the contents of a code span or code block literal:
  #
  #   * & becomes &amp;, < becomes &lt;, > becomes &gt;
  #   * the characters * _ { } [ ] and backslash are replaced by their
  #     entries in %g_escape_table, so that they lose their Markdown
  #     meaning.
  #
  # Returns the encoded string.
  #
      my ($code) = @_;

      # Ampersands go first, so that the entities added below are not
      # encoded a second time.
      $code =~ s/&/&amp;/g;
      $code =~ s/</&lt;/g;
      $code =~ s/>/&gt;/g;

      # The replacement values are hex digests and contain none of these
      # characters, so the order of this loop does not affect the result.
      foreach my $magic ('*', '_', '{', '}', '[', ']', '\\') {
          $code =~ s/\Q$magic\E/$g_escape_table{$magic}/g;
      }

      return $code;
  }
  870.  
  871.  
  872. sub _DoItalicsAndBold {
  873.     my $text = shift;
  874.  
  875.     # <strong> must go first:
  876.     $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
  877.         {\[b\]$2\[/b\]}gsx;
  878.  
  879.     $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
  880.         {\[i\]$2\[/i\]}gsx;
  881.  
  882.     return $text;
  883. }
  884.  
  885.  
  886. sub _DoBlockQuotes {
  887.     my $text = shift;
  888.  
  889.     $text =~ s{
  890.           (                             # Wrap whole match in $1
  891.             (
  892.               ^[ \t]*>[ \t]?            # '>' at the start of a line
  893.                 .+\n                    # rest of the first line
  894.               (.+\n)*                   # subsequent consecutive lines
  895.               \n*                       # blanks
  896.             )+
  897.           )
  898.         }{
  899.             my $bq = $1;
  900.             $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
  901.             $bq =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
  902.             $bq = _RunBlockGamut($bq);      # recurse
  903.             $bq =~ s/^/  /g;
  904.             # These leading spaces screw with <pre> content, so we need to fix that:
  905.             $bq =~ s{
  906.                     (\s*<pre>.+?</pre>)
  907.                 }{
  908.                     my $pre = $1;
  909.                     $pre =~ s/^  //mg;
  910.                     $pre;
  911.                 }egsx;
  912.             "[quote]\n$bq\n[/quote]\n\n";
  913.         }egmx;
  914.  
  915.  
  916.     return $text;
  917. }
  918.  
  919.  
  920. sub _FormParagraphs {
  921. #
  922. #   Params:
  923. #       $text - string to process with html <p> tags
  924. #
  925.     my $text = shift;
  926.  
  927.     # Strip leading and trailing lines:
  928.     $text =~ s/\A\n+//;
  929.     $text =~ s/\n+\z//;
  930.  
  931.     my @grafs = split(/\n{2,}/, $text);
  932.  
  933.     #
  934.     # Wrap <p> tags.
  935.     #
  936.     foreach (@grafs) {
  937.         unless (defined( $g_html_blocks{$_} )) {
  938.             $_ = _RunSpanGamut($_);
  939.             s/^([ \t]*)//;
  940.             $_ .= "";
  941.         }
  942.     }
  943.  
  944.     #
  945.     # Unhashify HTML blocks
  946.     #
  947.     foreach (@grafs) {
  948.         if (defined( $g_html_blocks{$_} )) {
  949.             $_ = $g_html_blocks{$_};
  950.         }
  951.     }
  952.  
  953.     return join "\n\n", @grafs;
  954. }
  955.  
  956.  
  957. sub _EncodeAmpsAndAngles {
  958. # Smart processing for ampersands and angle brackets that need to be encoded.
  959.  
  960.     my $text = shift;
  961.  
  962.     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
  963.     #   http://bumppo.net/projects/amputator/
  964.     $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
  965.  
  966.     # Encode naked <'s
  967.     $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
  968.  
  969.     return $text;
  970. }
  971.  
  972.  
  973. sub _EncodeBackslashEscapes {
  974. #
  975. #   Parameter:  String.
  976. #   Returns:    The string, with after processing the following backslash
  977. #               escape sequences.
  978. #
  979.     local $_ = shift;
  980.  
  981.     s! \\\\  !$g_escape_table{'\\'}!gx;     # Must process escaped backslashes first.
  982.     s! \\`   !$g_escape_table{'`'}!gx;
  983.    s! \\\*  !$g_escape_table{'*'}!gx;
  984.    s! \\_   !$g_escape_table{'_'}!gx;
  985.    s! \\\{  !$g_escape_table{'{'}!gx;
  986.    s! \\\}  !$g_escape_table{'}'}!gx;
  987.    s! \\\[  !$g_escape_table{'['}!gx;
  988.    s! \\\]  !$g_escape_table{']'}!gx;
  989.    s! \\\(  !$g_escape_table{'('}!gx;
  990.    s! \\\)  !$g_escape_table{')'}!gx;
  991.    s! \\>   !$g_escape_table{'>'}!gx;
  992.    s! \\\#  !$g_escape_table{'#'}!gx;
  993.     s! \\\+  !$g_escape_table{'+'}!gx;
  994.     s! \\\-  !$g_escape_table{'-'}!gx;
  995.     s! \\\.  !$g_escape_table{'.'}!gx;
  996.     s{ \\!  }{$g_escape_table{'!'}}gx;
  997.  
  998.     return $_;
  999. }
  1000.  
  1001.  
  1002. sub _DoAutoLinks {
  1003.     my $text = shift;
  1004.  
  1005.     $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
  1006.  
  1007.     # Email addresses: <address@domain.foo>
  1008.     $text =~ s{
  1009.         <
  1010.        (?:mailto:)?
  1011.         (
  1012.             [-.\w]+
  1013.             \@
  1014.             [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
  1015.         )
  1016.         >
  1017.     }{
  1018.         _EncodeEmailAddress( _UnescapeSpecialChars($1) );
  1019.     }egix;
  1020.  
  1021.     return $text;
  1022. }
  1023.  
  1024.  
  1025. sub _EncodeEmailAddress {
  1026. #
  1027. #   Input: an email address, e.g. "foo@example.com"
  1028. #
  1029. #   Output: the email address as a mailto link, with each character
  1030. #       of the address encoded as either a decimal or hex entity, in
  1031. #       the hopes of foiling most address harvesting spam bots. E.g.:
  1032. #
  1033. #     <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
  1034. #       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
  1035. #       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
  1036. #
  1037. #   Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
  1038. #   mailing list: <http://tinyurl.com/yu7ue>
  1039. #
  1040.  
  1041.     my $addr = shift;
  1042.  
  1043.     srand;
  1044.     my @encode = (
  1045.         sub { '&#' .                 ord(shift)   . ';' },
  1046.         sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
  1047.         sub {                            shift          },
  1048.     );
  1049.  
  1050.     $addr = "mailto:" . $addr;
  1051.  
  1052.     $addr =~ s{(.)}{
  1053.         my $char = $1;
  1054.         if ( $char eq '@' ) {
  1055.             # this *must* be encoded. I insist.
  1056.             $char = $encode[int rand 1]->($char);
  1057.         } elsif ( $char ne ':' ) {
  1058.             # leave ':' alone (to spot mailto: later)
  1059.             my $r = rand;
  1060.             # roughly 10% raw, 45% hex, 45% dec
  1061.             $char = (
  1062.                 $r > .9   ?  $encode[2]->($char)  :
  1063.                 $r < .45  ?  $encode[1]->($char)  :
  1064.                              $encode[0]->($char)
  1065.             );
  1066.         }
  1067.         $char;
  1068.     }gex;
  1069.  
  1070.     $addr = qq{<a href="$addr">$addr</a>};
  1071.     $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
  1072.  
  1073.     return $addr;
  1074. }
  1075.  
  1076.  
  1077. sub _UnescapeSpecialChars {
  1078. #
  1079. # Swap back in all the special characters we've hidden.
  1080. #
  1081.     my $text = shift;
  1082.  
  1083.     while( my($char, $hash) = each(%g_escape_table) ) {
  1084.         $text =~ s/$hash/$char/g;
  1085.     }
  1086.     return $text;
  1087. }
  1088.  
  1089.  
  1090. sub _TokenizeHTML {
  1091. #
  1092. #   Parameter:  String containing HTML markup.
  1093. #   Returns:    Reference to an array of the tokens comprising the input
  1094. #               string. Each token is either a tag (possibly with nested,
  1095. #               tags contained therein, such as <a href="<MTFoo>">, or a
  1096. #               run of text between tags. Each element of the array is a
  1097. #               two-element array; the first is either 'tag' or 'text';
  1098. #               the second is the actual value.
  1099. #
  1100. #
  1101. #   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
  1102. #       <http://www.bradchoate.com/past/mtregex.php>
  1103. #
  1104.  
  1105.     my $str = shift;
  1106.     my $pos = 0;
  1107.     my $len = length $str;
  1108.     my @tokens;
  1109.  
  1110.     my $depth = 6;
  1111.     my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
  1112.     my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
  1113.                    (?s: <\? .*? \?> ) |              # processing instruction
  1114.                    $nested_tags/ix;                   # nested tags
  1115.  
  1116.     while ($str =~ m/($match)/g) {
  1117.         my $whole_tag = $1;
  1118.         my $sec_start = pos $str;
  1119.         my $tag_start = $sec_start - length $whole_tag;
  1120.         if ($pos < $tag_start) {
  1121.             push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
  1122.         }
  1123.         push @tokens, ['tag', $whole_tag];
  1124.         $pos = pos $str;
  1125.     }
  1126.     push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
  1127.     \@tokens;
  1128. }
  1129.  
  1130.  
  1131. sub _Outdent {
  1132. #
  1133. # Remove one level of line-leading tabs or spaces
  1134. #
  1135.     my $text = shift;
  1136.  
  1137.     $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
  1138.     return $text;
  1139. }
  1140.  
  1141.  
  1142. sub _Detab {
  1143. #
  1144. # Cribbed from a post by Bart Lateur:
  1145. # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
  1146. #
  1147.     my $text = shift;
  1148.  
  1149.     $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
  1150.     return $text;
  1151. }
  1152.  
  1153.  
  1154. 1;
  1155.  
  1156. __END__
  1157. =pod
  1158. =head1 NAME
  1159. B<Markdown>
  1160. =head1 SYNOPSIS
  1161. B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<--shortversion> ]
  1162.     [ I<file> ... ]
  1163. =head1 DESCRIPTION
  1164. Markdown is a text-to-HTML filter; it translates an easy-to-read /
  1165. easy-to-write structured text format into HTML. Markdown's text format
  1166. is most similar to that of plain text email, and supports features such
  1167. as headers, *emphasis*, code blocks, blockquotes, and links.
  1168. Markdown's syntax is designed not as a generic markup language, but
  1169. specifically to serve as a front-end to (X)HTML. You can  use span-level
  1170. HTML tags anywhere in a Markdown document, and you can use block level
  1171. HTML tags (like <div> and <table>) as well.
  1172. For more information about Markdown's syntax, see:
  1173.     http://daringfireball.net/projects/markdown/
  1174. =head1 OPTIONS
  1175. Use "--" to end switch parsing. For example, to open a file named "-z", use:
  1176.     Markdown.pl -- -z
  1177. =over 4
  1178. =item B<--html4tags>
  1179. Use HTML 4 style for empty element tags, e.g.:
  1180.     <br>
  1181. instead of Markdown's default XHTML style tags, e.g.:
  1182.     <br />
  1183. =item B<-v>, B<--version>
  1184. Display Markdown's version number and copyright information.
  1185. =item B<-s>, B<--shortversion>
  1186. Display the short-form version number.
  1187. =back
  1188. =head1 BUGS
  1189. To file bug reports or feature requests (other than topics listed in the
  1190. Caveats section above) please send email to:
  1191.     support@daringfireball.net
  1192. Please include with your report: (1) the example input; (2) the output
  1193. you expected; (3) the output Markdown actually produced.
  1194. =head1 VERSION HISTORY
  1195. See the readme file for detailed release notes for this version.
  1196. 1.0.1 - 14 Dec 2004
  1197. 1.0 - 28 Aug 2004
  1198. =head1 AUTHOR
  1199.     John Gruber
  1200.     http://daringfireball.net
  1201.     PHP port and other contributions by Michel Fortin
  1202.     http://michelf.com
  1203. =head1 COPYRIGHT AND LICENSE
  1204. Copyright (c) 2003-2004 John Gruber  
  1205. <http://daringfireball.net/>  
  1206. All rights reserved.
  1207. Redistribution and use in source and binary forms, with or without
  1208. modification, are permitted provided that the following conditions are
  1209. met:
  1210. * Redistributions of source code must retain the above copyright notice,
  1211.   this list of conditions and the following disclaimer.
  1212. * Redistributions in binary form must reproduce the above copyright
  1213.   notice, this list of conditions and the following disclaimer in the
  1214.   documentation and/or other materials provided with the distribution.
  1215. * Neither the name "Markdown" nor the names of its contributors may
  1216.   be used to endorse or promote products derived from this software
  1217.   without specific prior written permission.
  1218. This software is provided by the copyright holders and contributors "as
  1219. is" and any express or implied warranties, including, but not limited
  1220. to, the implied warranties of merchantability and fitness for a
  1221. particular purpose are disclaimed. In no event shall the copyright owner
  1222. or contributors be liable for any direct, indirect, incidental, special,
  1223. exemplary, or consequential damages (including, but not limited to,
  1224. procurement of substitute goods or services; loss of use, data, or
  1225. profits; or business interruption) however caused and on any theory of
  1226. liability, whether in contract, strict liability, or tort (including
  1227. negligence or otherwise) arising in any way out of the use of this
  1228. software, even if advised of the possibility of such damage.
  1229. =cut
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×