Advertisement
Guest User

md2bb.pl

a guest
Sep 17th, 2017
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 31.81 KB | None | 0 0
  1. #!/usr/bin/perl
  2.  
  3. #
  4. # Markdown -- A text-to-HTML conversion tool for web writers
  5. #
  6. # Copyright (c) 2004 John Gruber
  7. # <http://daringfireball.net/projects/markdown/>
  8. #
  9.  
  10.  
  11. package Markdown2bbcode;
  12. require 5.006_000;
  13. use strict;
  14. use warnings;
  15.  
  16. use Digest::MD5 qw(md5_hex);
  17. use vars qw($VERSION);
  18. $VERSION = '1.0.1';
  19. # Tue 14 Dec 2004
  20.  
  21. ## Disabled; causes problems under Perl 5.6.1:
  22. # use utf8;
  23. # binmode( STDOUT, ":utf8" );  # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
  24.  
  25. #
  26. # Global default settings:
  27. #
  28. my $g_tab_width = 4;
  29.  
  30. #
  31. # Globals:
  32. #
  33.  
  34. # Regex to match balanced [brackets]. See Friedl's
  35. # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  36. my $g_nested_brackets;
  37. $g_nested_brackets = qr{
  38.     (?>                                 # Atomic matching
  39.        [^\[\]]+                         # Anything other than brackets
  40.      |
  41.        \[
  42.          (??{ $g_nested_brackets })     # Recursive set of nested brackets
  43.        \]
  44.     )*
  45. }x;
  46.  
  47.  
  48. # Table of hash values for escaped characters:
  49. my %g_escape_table;
  50. foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
  51.     $g_escape_table{$char} = md5_hex($char);
  52. }
  53.  
  54.  
  55. # Global hashes, used by various utility routines
  56. my %g_urls;
  57. my %g_titles;
  58. my %g_html_blocks;
  59.  
  60. # Used to track when we're inside an ordered or unordered list
  61. # (see _ProcessListItems() for details):
  62. my $g_list_level = 0;
  63.  
  64. sub start { 1; }
  65. sub story {
  66.     my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
  67.  
  68.     if ( (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
  69.          ){
  70.             $$body_ref  = Markdown($$body_ref);
  71.      }
  72.      1;
  73. }
  74.  
  75.  
  76. #############################################################################
  77. no warnings 'once';
  78. use warnings;
  79.  
  80. #### Check for command-line switches: #################
  81. my %cli_opts;
  82. use Getopt::Long;
  83. Getopt::Long::Configure('pass_through');
  84. GetOptions(\%cli_opts,
  85.     'version',
  86.     'shortversion',
  87.     'html4tags',
  88. );
  89. if ($cli_opts{'version'}) {     # Version info
  90.     print "\nThis is Markdown, version $VERSION.\n";
  91.     print "Copyright 2004 John Gruber\n";
  92.     print "http://daringfireball.net/projects/markdown/\n\n";
  93.     exit 0;
  94. }
  95. if ($cli_opts{'shortversion'}) {        # Just the version number string.
  96.     print $VERSION;
  97.     exit 0;
  98. }
  99.  
  100. #### Process incoming text: ###########################
  101. my $text;
  102. {
  103.     local $/;               # Slurp the whole file
  104.     $text = <>;
  105. }
  106.  
  107. my @lines = split /^/, (Markdown($text));
  108. foreach my $line (@lines) {
  109.     my $win = qx#xdotool getwindowfocus#;
  110.     chomp $win;
  111.     chomp $line;
  112.  
  113.     my $break = join(' ', 'xdotool', 'windowactivate', '--sync', $win, 'key', 'Return');
  114.     if ($line eq "\n") {system $break;
  115.     }
  116.     elsif (substr($line, 0, 2) eq "--") {my $line = join('', '[code]', $line, '[/code]');
  117.     my $cmd = join(' ', 'xdotool', 'type', '--delay', '0', '--window', $win, "$line", "\n" );
  118.     system $cmd;
  119.     }
  120.     else {
  121.     my $cmd = join(' ', 'xdotool', 'type', '--delay', '0', '--window', $win, "$line", "\n" );
  122.     system $cmd;
  123.     }
  124.     system $break;
  125. }
  126. ##############################################################################
  127.  
  128.  
  129. sub Markdown {
  130. #
  131. # Main function. The order in which other subs are called here is
  132. # essential. Link and image substitutions need to happen before
  133. # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
  134. # and <img> tags get encoded.
  135. #
  136.     my $text = shift;
  137.  
  138.     # Clear the global hashes. If we don't clear these, you get conflicts
  139.     # from other articles when generating a page which contains more than
  140.     # one article (e.g. an index page that shows the N most recent
  141.     # articles):
  142.     %g_urls = ();
  143.     %g_titles = ();
  144.     %g_html_blocks = ();
  145.  
  146.  
  147.     # Standardize line endings:
  148.     $text =~ s{\r\n}{\n}g;  # DOS to Unix
  149.     $text =~ s{\r}{\n}g;    # Mac to Unix
  150.  
  151.     # Make sure $text ends with a couple of newlines:
  152.     $text .= "\n\n";
  153.  
  154.     # Convert all tabs to spaces.
  155.     $text = _Detab($text);
  156.  
  157.     # Strip any lines consisting only of spaces and tabs.
  158.     # This makes subsequent regexen easier to write, because we can
  159.     # match consecutive blank lines with /\n+/ instead of something
  160.     # contorted like /[ \t]*\n+/ .
  161.     $text =~ s/^[ \t]+$//mg;
  162.  
  163.     # Turn block-level HTML blocks into hash entries
  164.     $text = _HashHTMLBlocks($text);
  165.  
  166.     # Strip link definitions, store in hashes.
  167.     $text = _StripLinkDefinitions($text);
  168.  
  169.     $text = _RunBlockGamut($text);
  170.  
  171.     $text = _UnescapeSpecialChars($text);
  172.  
  173.     return $text . "\n";
  174. }
  175.  
  176.  
  177. sub _StripLinkDefinitions {
  178. #
  179. # Strips link definitions from text, stores the URLs and titles in
  180. # hash references.
  181. #
  182.     my $text = shift;
  183.     my $less_than_tab = $g_tab_width - 1;
  184.  
  185.     # Link defs are in the form: ^[id]: url "optional title"
  186.     while ($text =~ s{
  187.                         ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
  188.                           [ \t]*
  189.                           \n?               # maybe *one* newline
  190.                           [ \t]*
  191.                         <?(\S+?)>?          # url = $2
  192.                           [ \t]*
  193.                           \n?               # maybe one newline
  194.                           [ \t]*
  195.                         (?:
  196.                             (?<=\s)         # lookbehind for whitespace
  197.                             ["(]
  198.                             (.+?)           # title = $3
  199.                             [")]
  200.                             [ \t]*
  201.                         )?  # title is optional
  202.                         (?:\n+|\Z)
  203.                     }
  204.                     {}mx) {
  205.         $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 );    # Link IDs are case-insensitive
  206.         if ($3) {
  207.             $g_titles{lc $1} = $3;
  208.             $g_titles{lc $1} =~ s/"/&quot;/g;
  209.         }
  210.     }
  211.  
  212.     return $text;
  213. }
  214.  
  215.  
sub _HashHTMLBlocks {
#
# Replaces block-level HTML in the text with placeholder keys.
#
# Each matched block is stored in %g_html_blocks under the MD5 hex digest
# of its own text, and the block in the document is replaced by that
# digest surrounded by blank lines. Four substitutions run in a fixed
# order: nested blocks, a more liberal single-block match, standalone
# <hr> tags, and standalone HTML comments.
#
# Takes the document text; returns the text with the blocks hashed out.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Hashify HTML blocks:
    # We only want to do this for block-level HTML tags, such as headers,
    # lists, and tables. That's because we still want to wrap <p>s around
    # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
    # phrase emphasis, and spans. The list of tags we're looking for is
    # hard-coded:
    # (The second list is the first one without "ins" and "del".)
    my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
    my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;

    # First, look for nested blocks, e.g.:
    #   <div>
    #       <div>
    #       tags for inner block must be indented.
    #       </div>
    #   </div>
    #
    # The outermost tags must start at the left margin for this to match, and
    # the inner nested divs must be indented.
    # We need to do this before the next, more liberal match, because the next
    # match will start at the first `<div>` and stop at the first `</div>`.
    $text =~ s{
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <($block_tags_a)    # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    </\2>               # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)  # followed by a newline or end of document
                )
            }{
                my $key = md5_hex($1);
                $g_html_blocks{$key} = $1;
                "\n\n" . $key . "\n\n";
            }egmx;


    #
    # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
    #
    $text =~ s{
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <($block_tags_b)    # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    .*</\2>             # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)  # followed by a newline or end of document
                )
            }{
                my $key = md5_hex($1);
                $g_html_blocks{$key} = $1;
                "\n\n" . $key . "\n\n";
            }egmx;
    # Special case just for <hr />. It was easier to make a special case than
    # to make the other regex more complicated.
    # Unlike the two matches above, the tag may be indented by up to
    # $less_than_tab spaces and must stand alone between blank lines (or at
    # the very start/end of the document).
    $text =~ s{
                (?:
                    (?<=\n\n)       # Starting after a blank line
                    |               # or
                    \A\n?           # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{0,$less_than_tab}
                    <(hr)               # start tag = $2
                    \b                  # word break
                    ([^<>])*?           #
                    /?>                 # the matching end tag
                    [ \t]*
                    (?=\n{2,}|\Z)       # followed by a blank line or end of document
                )
            }{
                my $key = md5_hex($1);
                $g_html_blocks{$key} = $1;
                "\n\n" . $key . "\n\n";
            }egx;

    # Special case for standalone HTML comments:
    # same placement rules as for <hr> above; the comment itself may span
    # several lines because of the (?s: ... ) group.
    $text =~ s{
                (?:
                    (?<=\n\n)       # Starting after a blank line
                    |               # or
                    \A\n?           # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{0,$less_than_tab}
                    (?s:
                        <!
                        (--.*?--\s*)+
                        >
                    )
                    [ \t]*
                    (?=\n{2,}|\Z)       # followed by a blank line or end of document
                )
            }{
                my $key = md5_hex($1);
                $g_html_blocks{$key} = $1;
                "\n\n" . $key . "\n\n";
            }egx;


    return $text;
}
  324.  
  325.  
  326. sub _RunBlockGamut {
  327. #
  328. # These are all the transformations that form block-level
  329. # tags like paragraphs, headers, and list items.
  330. #
  331.     my $text = shift;
  332.  
  333.     $text = _DoHeaders($text);
  334.  
  335.     $text = _DoLists($text);
  336.  
  337.     $text = _DoCodeBlocks($text);
  338.  
  339.     $text = _DoBlockQuotes($text);
  340.  
  341.     # We already ran _HashHTMLBlocks() before, in Markdown(), but that
  342.     # was to escape raw HTML in the original Markdown source. This time,
  343.     # we're escaping the markup we've just created, so that we don't wrap
  344.     # <p> tags around block-level tags.
  345.     $text = _HashHTMLBlocks($text);
  346.  
  347.     $text = _FormParagraphs($text);
  348.  
  349.     return $text;
  350. }
  351.  
  352.  
  353. sub _RunSpanGamut {
  354. #
  355. # These are all the transformations that occur *within* block-level
  356. # tags like paragraphs, headers, and list items.
  357. #
  358.     my $text = shift;
  359.  
  360.     $text = _DoCodeSpans($text);
  361.  
  362.     $text = _EscapeSpecialChars($text);
  363.  
  364.     # Process anchor and image tags. Images must come first,
  365.     # because ![foo][f] looks like an anchor.
  366.     $text = _DoImages($text);
  367.     $text = _DoAnchors($text);
  368.  
  369.     # Make links out of things like `<http://example.com/>`
  370.     # Must come after _DoAnchors(), because you can use < and >
  371.     # delimiters in inline links like [this](<url>).
  372.     $text = _DoAutoLinks($text);
  373.  
  374.     $text = _EncodeAmpsAndAngles($text);
  375.  
  376.     $text = _DoItalicsAndBold($text);
  377.  
  378.     # Do hard breaks:
  379.     $text =~ s/ {2,}\n/ \n/g;
  380.  
  381.     return $text;
  382. }
  383.  
  384.  
  385. sub _EscapeSpecialChars {
  386.     my $text = shift;
  387.     my $tokens ||= _TokenizeHTML($text);
  388.  
  389.     $text = '';   # rebuild $text from the tokens
  390. #   my $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
  391. #   my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
  392.  
  393.     foreach my $cur_token (@$tokens) {
  394.         if ($cur_token->[0] eq "tag") {
  395.             # Within tags, encode * and _ so they don't conflict
  396.             # with their use in Markdown for italics and strong.
  397.             # We're replacing each such character with its
  398.             # corresponding MD5 checksum value; this is likely
  399.             # overkill, but it should prevent us from colliding
  400.             # with the escape values by accident.
  401.             $cur_token->[1] =~  s! \* !$g_escape_table{'*'}!gx;
  402.             $cur_token->[1] =~  s! _  !$g_escape_table{'_'}!gx;
  403.             $text .= $cur_token->[1];
  404.         } else {
  405.             my $t = $cur_token->[1];
  406.             $t = _EncodeBackslashEscapes($t);
  407.             $text .= $t;
  408.         }
  409.     }
  410.     return $text;
  411. }
  412.  
  413.  
  414. sub _DoAnchors {
  415. #
  416. # Turn Markdown link shortcuts into XHTML <a> tags.
  417. #
  418.     my $text = shift;
  419.  
  420.     #
  421.     # First, handle reference-style links: [link text] [id]
  422.     #
  423.     $text =~ s{
  424.         (                   # wrap whole match in $1
  425.           \[
  426.             ($g_nested_brackets)    # link text = $2
  427.           \]
  428.           [ ]?              # one optional space
  429.           (?:\n[ ]*)?       # one optional newline followed by spaces
  430.           \[
  431.             (.*?)       # id = $3
  432.           \]
  433.         )
  434.     }{
  435.         my $result;
  436.         my $whole_match = $1;
  437.         my $link_text   = $2;
  438.         my $link_id     = lc $3;
  439.         if ($link_id eq "") {
  440.             $link_id = lc $link_text;     # for shortcut links like [this][].
  441.         }
  442.         if (defined $g_urls{$link_id}) {
  443.             my $url = $g_urls{$link_id};
  444.             $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
  445.             $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics/bold.
  446.             $result = "\[url=\"$url\"\]$link_text\[/url\]";
  447.         }
  448.         else {
  449.             $result = $whole_match;
  450.         }
  451.         $result;
  452.     }xsge;
  453.  
  454.     #
  455.     # Next, inline-style links: [link text](url "optional title")
  456.     #
  457.     $text =~ s{
  458.         (               # wrap whole match in $1
  459.           \[
  460.             ($g_nested_brackets)    # link text = $2
  461.           \]
  462.           \(            # literal paren
  463.             [ \t]*
  464.             <?(.*?)>?   # href = $3
  465.             [ \t]*
  466.             (           # $4
  467.               (['"])    # quote char = $5
  468.               (.*?)     # Title = $6
  469.               \5        # matching quote
  470.             )?          # title is optional
  471.           \)
  472.         )
  473.     }{
  474.         my $result;
  475.         my $whole_match = $1;
  476.         my $link_text   = $2;
  477.         my $url         = $3;
  478.         $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
  479.         $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics/bold.
  480.         $result = "\[url=\"$url\"\]$link_text\[/url\]";
  481.         $result;
  482.     }xsge;
  483.  
  484.     return $text;
  485. }
  486.  
  487.  
  488. sub _DoImages {
  489. #
  490. # Turn Markdown image shortcuts into <img> tags.
  491. #
  492.     my $text = shift;
  493.  
  494.     #
  495.     # First, handle reference-style labeled images: ![alt text][id]
  496.     #
  497.     $text =~ s{
  498.         (               # wrap whole match in $1
  499.           !\[
  500.             (.*?)       # alt text = $2
  501.           \]
  502.           [ ]?              # one optional space
  503.           (?:\n[ ]*)?       # one optional newline followed by spaces
  504.           \[
  505.             (.*?)       # id = $3
  506.           \]
  507.         )
  508.     }{
  509.         my $result;
  510.         my $whole_match = $1;
  511.         my $alt_text    = $2;
  512.         my $link_id     = lc $3;
  513.         if ($link_id eq "") {
  514.             $link_id = lc $alt_text;     # for shortcut links like ![this][].
  515.         }
  516.         $alt_text =~ s/"/&quot;/g;
  517.         if (defined $g_urls{$link_id}) {
  518.             my $url = $g_urls{$link_id};
  519.             $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
  520.             $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics/bold.
  521.             $result = "\[img=\"$alt_text\"\]$url\[/img\]";
  522.         }
  523.         else {
  524.             # If there's no such link ID, leave intact:
  525.             $result = $whole_match;
  526.         }
  527.         $result;
  528.     }xsge;
  529.  
  530.     #
  531.     # Next, handle inline images:  ![alt text](url "optional title")
  532.     # Don't forget: encode * and _
  533.  
  534.     $text =~ s{
  535.         (               # wrap whole match in $1
  536.           !\[
  537.             (.*?)       # alt text = $2
  538.           \]
  539.           \(            # literal paren
  540.             [ \t]*
  541.             <?(\S+?)>?  # src url = $3
  542.             [ \t]*
  543.             (           # $4
  544.               (['"])    # quote char = $5
  545.               (.*?)     # title = $6
  546.               \5        # matching quote
  547.               [ \t]*
  548.             )?          # title is optional
  549.           \)
  550.         )
  551.     }{
  552.         my $result;
  553.         my $whole_match = $1;
  554.         my $alt_text    = $2;
  555.         my $url         = $3;
  556.         $alt_text =~ s/"/&quot;/g;
  557.         $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
  558.         $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics/bold.
  559.         $result = "\[img=\"$alt_text\"\]$url\[/img\]";
  560.         $result;
  561.     }xsge;
  562.  
  563.     return $text;
  564. }
  565.  
  566.  
  567. sub _DoHeaders {
  568.     my $text = shift;
  569.  
  570.     # Setext-style headers:
  571.     #     Header 1
  572.     #     ========
  573.     #  
  574.     #     Header 2
  575.     #     --------
  576.     #
  577.     $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
  578.         "[h]"  .  _RunSpanGamut($1)  .  "[/h]\n\n";
  579.     }egmx;
  580.  
  581.     $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
  582.         "[h]"  .  _RunSpanGamut($1)  .  "[/h]\n\n";
  583.     }egmx;
  584.  
  585.  
  586.     # atx-style headers:
  587.     #   # Header 1
  588.     #   ## Header 2
  589.     #   ## Header 2 with closing hashes ##
  590.     #   ...
  591.     #   ###### Header 6
  592.     #
  593.     $text =~ s{
  594.             ^(\#{1,6})  # $1 = string of #'s
  595.             [ \t]*
  596.             (.+?)       # $2 = Header text
  597.             [ \t]*
  598.             \#*         # optional closing #'s (not counted)
  599.             \n+
  600.         }{
  601.             "[h]"  .  _RunSpanGamut($2)  .  "[/h]\n\n";
  602.         }egmx;
  603.  
  604.     return $text;
  605. }
  606.  
  607.  
  608. sub _DoLists {
  609. #
  610. # Form HTML ordered (numbered) and unordered (bulleted) lists.
  611. #
  612.     my $text = shift;
  613.     my $less_than_tab = $g_tab_width - 1;
  614.  
  615.     # Re-usable patterns to match list item bullets and number markers:
  616.     my $marker_ul  = qr/[*+-]/;
  617.     my $marker_ol  = qr/\d+[.]/;
  618.     my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
  619.  
  620.     # Re-usable pattern to match any entirel ul or ol list:
  621.     my $whole_list = qr{
  622.         (                               # $1 = whole list
  623.           (                             # $2
  624.             [ ]{0,$less_than_tab}
  625.             (${marker_any})             # $3 = first list item marker
  626.             [ \t]+
  627.           )
  628.           (?s:.+?)
  629.           (                             # $4
  630.               \z
  631.             |
  632.               \n{2,}
  633.               (?=\S)
  634.               (?!                       # Negative lookahead for another list item marker
  635.                 [ \t]*
  636.                 ${marker_any}[ \t]+
  637.               )
  638.           )
  639.         )
  640.     }mx;
  641.  
  642.     # We use a different prefix before nested lists than top-level lists.
  643.     # See extended comment in _ProcessListItems().
  644.     #
  645.     # Note: There's a bit of duplication here. My original implementation
  646.     # created a scalar regex pattern as the conditional result of the test on
  647.     # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
  648.     # substitution once, using the scalar as the pattern. This worked,
  649.     # everywhere except when running under MT on my hosting account at Pair
  650.     # Networks. There, this caused all rebuilds to be killed by the reaper (or
  651.     # perhaps they crashed, but that seems incredibly unlikely given that the
  652.     # same script on the same server ran fine *except* under MT. I've spent
  653.     # more time trying to figure out why this is happening than I'd like to
  654.     # admit. My only guess, backed up by the fact that this workaround works,
  655.     # is that Perl optimizes the substition when it can figure out that the
  656.     # pattern will never change, and when this optimization isn't on, we run
  657.     # afoul of the reaper. Thus, the slightly redundant code to that uses two
  658.     # static s/// patterns rather than one conditional pattern.
  659.  
  660.     if ($g_list_level) {
  661.         $text =~ s{
  662.                 ^
  663.                 $whole_list
  664.             }{
  665.                 my $list = $1;
  666.                 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
  667.                 # Turn double returns into triple returns, so that we can make a
  668.                 # paragraph for the last item in a list, if necessary:
  669.                 $list =~ s/\n{2,}/\n\n\n/g;
  670.                 my $result = _ProcessListItems($list, $marker_any);
  671.                 $result = "<$list_type>\n" . $result . "</$list_type>\n";
  672.                 $result;
  673.             }egmx;
  674.     }
  675.     else {
  676.         $text =~ s{
  677.                 (?:(?<=\n\n)|\A\n?)
  678.                 $whole_list
  679.             }{
  680.                 my $list = $1;
  681.                 my $list_type = ($3 =~ m/$marker_ul/) ? "list" : "list=1";
  682.                 # Turn double returns into triple returns, so that we can make a
  683.                 # paragraph for the last item in a list, if necessary:
  684.                 $list =~ s/\n{2,}/\n\n\n/g;
  685.                 my $result = _ProcessListItems($list, $marker_any);
  686.                 $result = "[$list_type]\n" . $result . "[/list]\n";
  687.                 $result;
  688.             }egmx;
  689.     }
  690.  
  691.  
  692.     return $text;
  693. }
  694.  
  695.  
sub _ProcessListItems {
#
#   Process the contents of a single ordered or unordered list, splitting it
#   into individual list items.
#
#   Arguments:
#     $list_str   - the text of one whole list
#     $marker_any - pattern matching a bullet or number marker; it is
#                   interpolated into the item regex below
#
#   Returns the list text with every item rewritten as
#   `[\*]` . item . `[/\*]` plus a newline. The backslashes are part of
#   the emitted text.
#   NOTE(review): presumably a later backslash-escape pass turns `\*` into
#   a literal asterisk -- confirm against _EncodeBackslashEscapes().
#
#   $g_list_level is incremented for the duration of the call, so the
#   recursive _DoLists()/_RunBlockGamut() calls made for the items see a
#   non-zero level.
#

    my $list_str = shift;
    my $marker_any = shift;


    # The $g_list_level global keeps track of when we're inside a list.
    # Each time we enter a list, we increment it; when we leave a list,
    # we decrement. If it's zero, we're not in a list anymore.
    #
    # We do this because when we're not inside a list, we want to treat
    # something like this:
    #
    #       I recommend upgrading to version
    #       8. Oops, now this line is treated
    #       as a sub-list.
    #
    # As a single paragraph, despite the fact that the second line starts
    # with a digit-period-space sequence.
    #
    # Whereas when we're inside a list (or sub-list), that line will be
    # treated as the start of a sub-list. What a kludge, huh? This is
    # an aspect of Markdown's syntax that's hard to parse perfectly
    # without resorting to mind-reading. Perhaps the solution is to
    # change the syntax rules such that sub-lists must start with a
    # starting cardinal number; e.g. "1." or "a.".

    $g_list_level++;

    # trim trailing blank lines:
    $list_str =~ s/\n{2,}\z/\n/;


    # One substitution per list item. An item that was preceded by a blank
    # line, or that itself contains a blank line, gets the full block-level
    # treatment; any other item is only checked for sub-lists and then run
    # through the span-level transformations.
    $list_str =~ s{
        (\n)?                           # leading line = $1
        (^[ \t]*)                       # leading whitespace = $2
        ($marker_any) [ \t]+            # list marker = $3
        ((?s:.+?)                       # list item text   = $4
        (\n{1,2}))
        (?= \n* (\z | \2 ($marker_any) [ \t]+))
    }{
        my $item = $4;
        my $leading_line = $1;
        my $leading_space = $2;
        if ($leading_line or ($item =~ m/\n{2,}/)) {
            $item = _RunBlockGamut(_Outdent($item));
        }
        else {
            # Recursion for sub-lists:
            $item = _DoLists(_Outdent($item));
            chomp $item;
            $item = _RunSpanGamut($item);
        }
        "[\\*]" . $item . "[/\\*]\n";
    }egmx;

    $g_list_level--;
    return $list_str;
}
  759.  
  760.  
  761.  
  762. sub _DoCodeBlocks {
  763. #
  764. #   Process Markdown `<pre><code>` blocks.
  765. #  
  766.  
  767.     my $text = shift;
  768.  
  769.     $text =~ s{
  770.             (?:\n\n|\A)
  771.             (               # $1 = the code block -- one or more lines, starting with a space/tab
  772.               (?:
  773.                 (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
  774.                 .*\n+
  775.               )+
  776.             )
  777.             ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
  778.         }{
  779.             my $codeblock = $1;
  780.             my $result; # return value
  781.             $codeblock = _EncodeCode(_Outdent($codeblock));
  782.             $codeblock = _Detab($codeblock);
  783.             $codeblock =~ s/\A\n+//; # trim leading newlines
  784.             $codeblock =~ s/\s+\z//; # trim trailing whitespace
  785.             $result = "\n\n[code]" . $codeblock . "\n[/code]\n\n";
  786.             $result;
  787.         }egmx;
  788.  
  789.     return $text;
  790. }
  791.  
  792.  
  793. sub _DoCodeSpans {
  794. #
  795. #   *   Backtick quotes are used for <code></code> spans.
  796. #
  797. #   *   You can use multiple backticks as the delimiters if you want to
  798. #       include literal backticks in the code span. So, this input:
  799. #    
  800. #         Just type ``foo `bar` baz`` at the prompt.
  801. #    
  802. #       Will translate to:
  803. #    
  804. #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
  805. #    
  806. #       There's no arbitrary limit to the number of backticks you
  807. #       can use as delimters. If you need three consecutive backticks
  808. #       in your code, use four for delimiters, etc.
  809. #
  810. #   *   You can use spaces to get literal backticks at the edges:
  811. #    
  812. #         ... type `` `bar` `` ...
  813. #    
  814. #       Turns to:
  815. #    
  816. #         ... type <code>`bar`</code> ...
  817. #
  818.  
  819.     my $text = shift;
  820.  
  821.     $text =~ s@
  822.             (`+)        # $1 = Opening run of `
  823.             (.+?)       # $2 = The code block
  824.             (?<!`)
  825.             \1          # Matching closer
  826.             (?!`)
  827.         @
  828.             my $c = "$2";
  829.             $c =~ s/^[ \t]*//g; # leading whitespace
  830.             $c =~ s/[ \t]*$//g; # trailing whitespace
  831.             $c = _EncodeCode($c);
  832.             "\[code\]$c\[/code\]";
  833.         @egsx;
  834.  
  835.     return $text;
  836. }
  837.  
  838.  
  839. sub _EncodeCode {
  840. #
  841. # Encode/escape certain characters inside Markdown code runs.
  842. # The point is that in code, these characters are literals,
  843. # and lose their special Markdown meanings.
  844. #
  845.     local $_ = shift;
  846.  
  847.     # Encode all ampersands; HTML entities are not
  848.     # entities within a Markdown code span.
  849.     s/&/&amp;/g;
  850.  
  851.     # Do the angle bracket song and dance:
  852.     s! <  !&lt;!gx;
  853.     s! >  !&gt;!gx;
  854.  
  855.     # Now, escape characters that are magic in Markdown:
  856.     s! \* !$g_escape_table{'*'}!gx;
  857.     s! _  !$g_escape_table{'_'}!gx;
  858.     s! {  !$g_escape_table{'{'}!gx;
  859.     s! }  !$g_escape_table{'}'}!gx;
  860.     s! \[ !$g_escape_table{'['}!gx;
  861.     s! \] !$g_escape_table{']'}!gx;
  862.     s! \\ !$g_escape_table{'\\'}!gx;
  863.  
  864.     return $_;
  865. }
  866.  
  867.  
  868. sub _DoItalicsAndBold {
  869.     my $text = shift;
  870.  
  871.     # <strong> must go first:
  872.     $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
  873.         {\[b\]$2\[/b\]}gsx;
  874.  
  875.     $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
  876.         {\[i\]$2\[/i\]}gsx;
  877.  
  878.     return $text;
  879. }
  880.  
  881.  
  882. sub _DoBlockQuotes {
  883.     my $text = shift;
  884.  
  885.     $text =~ s{
  886.           (                             # Wrap whole match in $1
  887.             (
  888.               ^[ \t]*>[ \t]?            # '>' at the start of a line
  889.                 .+\n                    # rest of the first line
  890.               (.+\n)*                   # subsequent consecutive lines
  891.               \n*                       # blanks
  892.             )+
  893.           )
  894.         }{
  895.             my $bq = $1;
  896.             $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
  897.             $bq =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
  898.             $bq = _RunBlockGamut($bq);      # recurse
  899.             $bq =~ s/^/  /g;
  900.             # These leading spaces screw with <pre> content, so we need to fix that:
  901.             $bq =~ s{
  902.                     (\s*<pre>.+?</pre>)
  903.                 }{
  904.                     my $pre = $1;
  905.                     $pre =~ s/^  //mg;
  906.                     $pre;
  907.                 }egsx;
  908.             "[quote]\n$bq\n[/quote]\n\n";
  909.         }egmx;
  910.  
  911.  
  912.     return $text;
  913. }
  914.  
  915.  
  916. sub _FormParagraphs {
  917. #
  918. #   Params:
  919. #       $text - string to process with html <p> tags
  920. #
  921.     my $text = shift;
  922.  
  923.     # Strip leading and trailing lines:
  924.     $text =~ s/\A\n+//;
  925.     $text =~ s/\n+\z//;
  926.  
  927.     my @grafs = split(/\n{2,}/, $text);
  928.  
  929.     #
  930.     # Wrap <p> tags.
  931.     #
  932.     foreach (@grafs) {
  933.         unless (defined( $g_html_blocks{$_} )) {
  934.             $_ = _RunSpanGamut($_);
  935.             s/^([ \t]*)//;
  936.             $_ .= "";
  937.         }
  938.     }
  939.  
  940.     #
  941.     # Unhashify HTML blocks
  942.     #
  943.     foreach (@grafs) {
  944.         if (defined( $g_html_blocks{$_} )) {
  945.             $_ = $g_html_blocks{$_};
  946.         }
  947.     }
  948.  
  949.     return join "\n\n", @grafs;
  950. }
  951.  
  952.  
  953. sub _EncodeAmpsAndAngles {
  954. # Smart processing for ampersands and angle brackets that need to be encoded.
  955.  
  956.     my $text = shift;
  957.  
  958.     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
  959.     #   http://bumppo.net/projects/amputator/
  960.     $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
  961.  
  962.     # Encode naked <'s
  963.     $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
  964.  
  965.     return $text;
  966. }
  967.  
  968.  
  969. sub _EncodeBackslashEscapes {
  970. #
  971. #   Parameter:  String.
  972. #   Returns:    The string, with after processing the following backslash
  973. #               escape sequences.
  974. #
  975.     local $_ = shift;
  976.  
  977.     s! \\\\  !$g_escape_table{'\\'}!gx;     # Must process escaped backslashes first.
  978.     s! \\`   !$g_escape_table{'`'}!gx;
  979.    s! \\\*  !$g_escape_table{'*'}!gx;
  980.    s! \\_   !$g_escape_table{'_'}!gx;
  981.    s! \\\{  !$g_escape_table{'{'}!gx;
  982.    s! \\\}  !$g_escape_table{'}'}!gx;
  983.    s! \\\[  !$g_escape_table{'['}!gx;
  984.    s! \\\]  !$g_escape_table{']'}!gx;
  985.    s! \\\(  !$g_escape_table{'('}!gx;
  986.    s! \\\)  !$g_escape_table{')'}!gx;
  987.    s! \\>   !$g_escape_table{'>'}!gx;
  988.    s! \\\#  !$g_escape_table{'#'}!gx;
  989.     s! \\\+  !$g_escape_table{'+'}!gx;
  990.     s! \\\-  !$g_escape_table{'-'}!gx;
  991.     s! \\\.  !$g_escape_table{'.'}!gx;
  992.     s{ \\!  }{$g_escape_table{'!'}}gx;
  993.  
  994.     return $_;
  995. }
  996.  
  997.  
  998. sub _DoAutoLinks {
  999.     my $text = shift;
  1000.  
  1001.     $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
  1002.  
  1003.     # Email addresses: <address@domain.foo>
  1004.     $text =~ s{
  1005.         <
  1006.        (?:mailto:)?
  1007.         (
  1008.             [-.\w]+
  1009.             \@
  1010.             [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
  1011.         )
  1012.         >
  1013.     }{
  1014.         _EncodeEmailAddress( _UnescapeSpecialChars($1) );
  1015.     }egix;
  1016.  
  1017.     return $text;
  1018. }
  1019.  
  1020.  
  1021. sub _EncodeEmailAddress {
  1022. #
  1023. #   Input: an email address, e.g. "foo@example.com"
  1024. #
  1025. #   Output: the email address as a mailto link, with each character
  1026. #       of the address encoded as either a decimal or hex entity, in
  1027. #       the hopes of foiling most address harvesting spam bots. E.g.:
  1028. #
  1029. #     <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
  1030. #       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
  1031. #       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
  1032. #
  1033. #   Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
  1034. #   mailing list: <http://tinyurl.com/yu7ue>
  1035. #
  1036.  
  1037.     my $addr = shift;
  1038.  
  1039.     srand;
  1040.     my @encode = (
  1041.         sub { '&#' .                 ord(shift)   . ';' },
  1042.         sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
  1043.         sub {                            shift          },
  1044.     );
  1045.  
  1046.     $addr = "mailto:" . $addr;
  1047.  
  1048.     $addr =~ s{(.)}{
  1049.         my $char = $1;
  1050.         if ( $char eq '@' ) {
  1051.             # this *must* be encoded. I insist.
  1052.             $char = $encode[int rand 1]->($char);
  1053.         } elsif ( $char ne ':' ) {
  1054.             # leave ':' alone (to spot mailto: later)
  1055.             my $r = rand;
  1056.             # roughly 10% raw, 45% hex, 45% dec
  1057.             $char = (
  1058.                 $r > .9   ?  $encode[2]->($char)  :
  1059.                 $r < .45  ?  $encode[1]->($char)  :
  1060.                              $encode[0]->($char)
  1061.             );
  1062.         }
  1063.         $char;
  1064.     }gex;
  1065.  
  1066.     $addr = qq{<a href="$addr">$addr</a>};
  1067.     $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
  1068.  
  1069.     return $addr;
  1070. }
  1071.  
  1072.  
  1073. sub _UnescapeSpecialChars {
  1074. #
  1075. # Swap back in all the special characters we've hidden.
  1076. #
  1077.     my $text = shift;
  1078.  
  1079.     while( my($char, $hash) = each(%g_escape_table) ) {
  1080.         $text =~ s/$hash/$char/g;
  1081.     }
  1082.     return $text;
  1083. }
  1084.  
  1085.  
  1086. sub _TokenizeHTML {
  1087. #
  1088. #   Parameter:  String containing HTML markup.
  1089. #   Returns:    Reference to an array of the tokens comprising the input
  1090. #               string. Each token is either a tag (possibly with nested,
  1091. #               tags contained therein, such as <a href="<MTFoo>">, or a
  1092. #               run of text between tags. Each element of the array is a
  1093. #               two-element array; the first is either 'tag' or 'text';
  1094. #               the second is the actual value.
  1095. #
  1096. #
  1097. #   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
  1098. #       <http://www.bradchoate.com/past/mtregex.php>
  1099. #
  1100.  
  1101.     my $str = shift;
  1102.     my $pos = 0;
  1103.     my $len = length $str;
  1104.     my @tokens;
  1105.  
  1106.     my $depth = 6;
  1107.     my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
  1108.     my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
  1109.                    (?s: <\? .*? \?> ) |              # processing instruction
  1110.                    $nested_tags/ix;                   # nested tags
  1111.  
  1112.     while ($str =~ m/($match)/g) {
  1113.         my $whole_tag = $1;
  1114.         my $sec_start = pos $str;
  1115.         my $tag_start = $sec_start - length $whole_tag;
  1116.         if ($pos < $tag_start) {
  1117.             push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
  1118.         }
  1119.         push @tokens, ['tag', $whole_tag];
  1120.         $pos = pos $str;
  1121.     }
  1122.     push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
  1123.     \@tokens;
  1124. }
  1125.  
  1126.  
  1127. sub _Outdent {
  1128. #
  1129. # Remove one level of line-leading tabs or spaces
  1130. #
  1131.     my $text = shift;
  1132.  
  1133.     $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
  1134.     return $text;
  1135. }
  1136.  
  1137.  
  1138. sub _Detab {
  1139. #
  1140. # Cribbed from a post by Bart Lateur:
  1141. # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
  1142. #
  1143.     my $text = shift;
  1144.  
  1145.     $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
  1146.     return $text;
  1147. }
  1148.  
  1149.  
  1150. 1;
  1151.  
  1152. __END__
=pod

=head1 NAME

B<Markdown>

=head1 SYNOPSIS

B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<--shortversion> ]
    [ I<file> ... ]

=head1 DESCRIPTION

Markdown is a text-to-HTML filter; it translates an easy-to-read /
easy-to-write structured text format into HTML. Markdown's text format
is most similar to that of plain text email, and supports features such
as headers, *emphasis*, code blocks, blockquotes, and links.

Markdown's syntax is designed not as a generic markup language, but
specifically to serve as a front-end to (X)HTML. You can use span-level
HTML tags anywhere in a Markdown document, and you can use block level
HTML tags (like <div> and <table>) as well.

For more information about Markdown's syntax, see:

    http://daringfireball.net/projects/markdown/

=head1 OPTIONS

Use "--" to end switch parsing. For example, to open a file named "-z", use:

    Markdown.pl -- -z

=over 4

=item B<--html4tags>

Use HTML 4 style for empty element tags, e.g.:

    <br>

instead of Markdown's default XHTML style tags, e.g.:

    <br />

=item B<-v>, B<--version>

Display Markdown's version number and copyright information.

=item B<-s>, B<--shortversion>

Display the short-form version number.

=back

=head1 BUGS

To file bug reports or feature requests (other than topics listed in the
Caveats section above) please send email to:

    support@daringfireball.net

Please include with your report: (1) the example input; (2) the output
you expected; (3) the output Markdown actually produced.

=head1 VERSION HISTORY

See the readme file for detailed release notes for this version.

1.0.1 - 14 Dec 2004

1.0 - 28 Aug 2004

=head1 AUTHOR

    John Gruber
    http://daringfireball.net

    PHP port and other contributions by Michel Fortin
    http://michelf.com

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2003-2004 John Gruber
<http://daringfireball.net/>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

* Neither the name "Markdown" nor the names of its contributors may
  be used to endorse or promote products derived from this software
  without specific prior written permission.

This software is provided by the copyright holders and contributors "as
is" and any express or implied warranties, including, but not limited
to, the implied warranties of merchantability and fitness for a
particular purpose are disclaimed. In no event shall the copyright owner
or contributors be liable for any direct, indirect, incidental, special,
exemplary, or consequential damages (including, but not limited to,
procurement of substitute goods or services; loss of use, data, or
profits; or business interruption) however caused and on any theory of
liability, whether in contract, strict liability, or tort (including
negligence or otherwise) arising in any way out of the use of this
software, even if advised of the possibility of such damage.

=cut
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement