#!/usr/bin/env newlisp
;; @module markdown
;; @author cormullion
;; @description a port of John Gruber's Markdown to newLISP
;; @location http://unbalanced-parentheses.nfshost.com/
;; @version of date 2011-10-02 22:36:02
;; version history: at the end
;; a port of John Gruber's Markdown.pl (http://daringfireball.net/markdown) script to newLISP...
;; see his original Perl script for explanations of the fearsome regexen and
;; byzantine logic, etc...
;; TODO:
;; the following Markdown tests fail:
;; Inline HTML (Advanced) ... FAILED
;; Links, reference style ... FAILED -- nested brackets
;; Links, shortcut references ... FAILED
;; Markdown Documentation - Syntax ... FAILED
;; Ordered and unordered lists ... FAILED -- a nested ordered list error
;; parens in url : .jpg) see (Images.text)
;; Add: email address scrambling
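;; @example
;; ; a minimal usage sketch (assumes the file is saved as markdown.lsp and loaded as a module):
;; (load "markdown.lsp")
;; (markdown:markdown "this is *emphasis*")
;; ;-> "<p>this is <em>emphasis</em></p>"   ; roughly; exact whitespace may differ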
(context 'Hash)
(define HashTable:HashTable)
(define (build-escape-table)
(set '*escape-chars* [text]\`*_{}[]()>#+-.![/text])
(dolist (c (explode *escape-chars*))
(HashTable c (hash c))))
(define (init-hash txt)
; finds a hash identifier that doesn't occur anywhere in the text
(set 'counter 0)
(set 'hash-prefix "HASH")
(set 'hash-id (string hash-prefix counter))
(do-while (find hash-id txt)
(set 'hash-id (string hash-prefix (inc counter))))
(Hash:build-escape-table))
(define (hash s)
(HashTable s (string hash-id (inc counter))))
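; illustration (values assumed): after (Hash:init-hash txt) has picked a prefix such as "HASH0",
; (Hash:hash "<div>x</div>") stores and returns a placeholder like "HASH01";
; (HashTable "<div>x</div>") retrieves the same placeholder later, so hashed text passes
; through the transforms untouched and is swapped back by unescape-special-chars at the end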
(context 'markdown)
(define (markdown:markdown txt)
(initialize)
(Hash:init-hash txt)
(unescape-special-chars
(block-transforms
(strip-link-definitions
(protect
(cleanup txt))))))
(define (initialize)
(set '*escape-pairs* '(
({\\\\} {\})
({\\`} {`})
({\\\*} {*})
({\\_} {_})
([text]\\\{[/text] [text]{[/text])
([text]\\\}[/text] [text]}[/text])
({\\\[} {[})
({\\\]} {]})
({\\\(} {(})
({\\\)} {)})
({\\>} {>})
({\\\#} {#})
({\\\+} {+})
({\\\-} {-})
({\\\.} {.})
({\\!} {!})))
(set '*hashed-html-blocks* '())
(set '*list-level* 0))
(define (block-transforms txt)
(form-paragraphs
(protect
(block-quotes
(code-blocks
(lists
(horizontal-rules
(headers txt))))))))
(define (span-transforms txt)
(line-breaks
(emphasis
(amps-and-angles
(auto-links
(anchors
(images
(escape-special-chars
(escape-special-chars (code-spans txt) 'inside-attributes)))))))))
(define (tokenize-html xhtml)
; return list of tag/text portions of xhtml text
(letn (
(tag-match [text]((?s:<!(?:--.*?--\s*)+>)|
(?s:<\?.*?\?>)|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>]|
(?:<[a-z/!$](?:[^<>])*>))*>))*>))*>))*>))*>))[/text]) ; yeah, well...
(str xhtml)
(len (length str))
(pos 0)
(tokens '()))
(while (set 'tag-start (find tag-match str 8))
(if (< pos tag-start)
(push (list 'text (slice str pos (- tag-start pos))) tokens -1))
(push (list 'tag $0) tokens -1)
(set 'str (slice str (+ tag-start (length $0))))
(set 'pos 0))
; leftovers
(if (< pos (length str))
(push (list 'text (slice str pos (- (length str) pos))) tokens -1))
tokens))
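; illustration (input assumed): (tokenize-html "abc <b>hi</b>") yields a list along the lines of
; ((text "abc ") (tag "<b>") (text "hi") (tag "</b>"));
; escape-special-chars walks this list so text inside tags and text between tags
; can be escaped differently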
(define (escape-special-chars txt (within-tag-attributes nil))
(let ((temp (tokenize-html txt))
(new-text {}))
(dolist (pair temp)
(if (= (first pair) 'tag)
; 'tag
(begin
(set 'new-text (replace {\\} (last pair) (HashTable "\\") 0))
(replace [text](?<=.)</?code>(?=.)[/text] new-text (HashTable {`}) 0)
(replace {\*} new-text (HashTable {*}) 0)
(replace {_} new-text (HashTable {_} ) 0))
; 'text
(if within-tag-attributes
(set 'new-text (last pair))
(set 'new-text (encode-backslash-escapes (last pair)))))
(setf (temp $idx) (list (first pair) new-text)))
; return as text
(join (map last temp))))
(define (encode-backslash-escapes t)
(dolist (pair *escape-pairs*)
(replace (first pair) t (HashTable (last pair)) 14)))
(define (encode-code s)
; encode/escape certain characters inside Markdown code runs
(replace {&} s "&amp;" 0)
(replace {<} s "&lt;" 0)
(replace {>} s "&gt;" 0)
(replace {\*} s (HashTable {*}) 0)
(replace {_} s (HashTable {_}) 0)
(replace "{" s (HashTable "{") 0)
(replace "}" s (HashTable "}") 0)
(replace {\[} s (HashTable {[}) 0)
(replace {\]} s (HashTable {]}) 0)
(replace {\\} s (HashTable "\\") 0))
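; illustration: (encode-code "1 < 2 & 3 > 2") gives "1 &lt; 2 &amp; 3 &gt; 2", while *, _,
; braces and brackets are swapped for their HashTable placeholders so that the later
; span transforms leave literal code untouched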
(define (code-spans s)
(replace
{(?<!\\)(`+)(.+?)(?<!`)\1(?!`)}
s
(string {<code>} (encode-code (trim $2)) {</code>})
2))
(define (encode-alt s)
(replace {&} s "&amp;" 0)
(replace {"} s "&quot;" 0))
(define (images txt)
(let ((alt-text {})
(url {})
(title {})
(ref-regex {(!\[(.*?)\][ ]?(?:\n[ ]*)?\[(.*?)\])})
(inline-regex {(!\[(.*?)\]\([ \t]*<?(\S+?)>?[ \t]*((['"])(.*?)\5[ \t]*)?\))})
(whole-match {})
(result {})
(id-ref {})
(url {}))
; reference links ![alt text][id]
(replace
ref-regex
txt
(begin
(set 'whole-match $1 'alt-text $2 'id-ref $3)
(if alt-text
(replace {"} alt-text {"} 0))
(if (empty? id-ref)
(set 'id-ref (lower-case alt-text)))
(if (lookup id-ref *link-database*)
(set 'url (first (lookup id-ref *link-database*)))
(set 'url nil))
(if url
(begin
(replace {\*} url (HashTable {*}) 0)
(replace {_} url (HashTable {_}) 0)
))
(if (last (lookup id-ref *link-database*))
; title
(begin
(set 'title (last (lookup id-ref *link-database*)))
(replace {"} title {"} 0)
(replace {\*} title (HashTable {*}) 0)
(replace {_} title (HashTable {_}) 0))
; no title
(set 'title {})
)
(if url
(set 'result (string
{<img src="} url {" alt="} alt-text {" title="} title {" />}))
(set 'result whole-match))
)
0
)
; inline image refs: 
(replace
inline-regex
txt
(begin
(set 'whole-match $1)
(set 'alt-text $2)
(set 'url $3)
(set 'title $6)
(if alt-text
(replace {"} alt-text {"} 0)
(set 'alt-text {}))
(if title
(begin
(replace {"} title {"} 0)
(replace {\*} title (HashTable {*}) 0)
(replace {_} title (HashTable {_}) 0))
(set 'title {}))
(replace {\*} url (HashTable {*}) 0)
(replace {_} url (HashTable {_}) 0)
(string
{<img src="} url {" alt="} alt-text {" title="} title {" />})
)
0
)
; empty ones are possible
(set '$1 {})
(replace {!\[(.*?)\]\([ \t]*\)}
txt
(string {<img src="" alt="} $1 {" title="" />})
0)))
(define (make-anchor link-text id-ref )
; Link defs are in the form: ^[id]: url "optional title"
; stored in link db list as (id (url title))
; params are text to be linked and the id of the link in the db
; eg bar 1 for [bar][1]
(let ((title {})
(id id-ref)
(url nil))
(if link-text
(begin
(replace {"} link-text {"} 0)
(replace {\n} link-text { } 0)
(replace {[ ]?\n} link-text { } 0)))
(if (null? id ) (set 'id (lower-case link-text)))
(if (not (nil? (lookup id *link-database*)))
(begin
(set 'url (first (lookup id *link-database*)))
(replace {\*} url (HashTable {*}) 0)
(replace {_} url (HashTable {_}) 0)
(if (set 'title (last (lookup id *link-database*)))
(begin
(replace {"} title {"} 0)
(replace {\*} title (HashTable {*}) 0)
(replace {_} title (HashTable {_}) 0))
(set 'title {})))
(set 'url nil))
(if url
(string {<a href="} url {" title="} title {">} link-text {</a>})
(string {[} link-text {][} id-ref {]}))))
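; illustration (hypothetical database entry): with ("1" ("http://example.com/" "Example"))
; in *link-database*, (make-anchor "bar" "1") produces roughly
;   <a href="http://example.com/" title="Example">bar</a>
; and an id that is not in the database falls back to the literal [bar][1] text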
(define (anchors txt)
(letn ((nested-brackets {(?>[^\[\]]+)*})
(ref-link-regex (string {(\[(} nested-brackets {)\][ ]?(?:\n[ ]*)?\[(.*?)\])}))
(inline-regex {(\[(.*?)\]\([ ]*(.*?\)?)>?[ ]*((['"])(.*?)\5[ \t]*)?\))})
(link-text {})
(url {})
(title {}))
; reference-style links: [link text] [id]
(set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {} '$6 {}) ; i still don't think I should have to do this...
; what about this regex instead?
(set 'ref-link-regex {(\[(.*?)\][ ]?\[(.*?)\])})
(replace ref-link-regex txt (make-anchor $2 $3) 8) ; $2 is link text, $3 is id
; inline links: [link text](url "optional title")
(set '$1 {} '$2 {} '$3 {} '$4 {} '$5 {} '$6 {})
(replace
inline-regex
txt
(begin
(set 'link-text $2)
(set 'url $3)
(set 'title $6)
(if link-text (replace {"} link-text {&quot;} 0))
(if title
(begin
(replace {"} title {"} 0)
(replace {\*} title (HashTable {*}) 0)
(replace {_} title (HashTable {_}) 0))
(set 'title {}))
(replace {\*} url (HashTable {*}) 0)
(replace {_} url (HashTable {_}) 0)
(replace {^<(.*)>$} url $1 0)
(string
{<a href="} url {" title="} title {">} link-text {</a>}
))
8
) ; replace
) txt)
(define (auto-links txt)
(replace
[text]<((https?|ftp):[^'">\s]+)>[/text]
txt
(string {<a href="} $1 {">} $1 {</a>})
0
)
; to-do: email ...
)
(define (amps-and-angles txt)
; Smart processing for ampersands and angle brackets
(replace
[text]&(?!\#?[xX]?(?:[0-9a-fA-F]+|\w+);)[/text]
txt
{&amp;}
10
)
(replace
[text]<(?![a-z/?\$!])[/text]
txt
{&lt;}
10))
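; illustration: "AT&T is < 50% done" becomes "AT&amp;T is &lt; 50% done", while an existing
; entity such as "&copy;" and a genuine tag opener such as "<em>" are left alone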
(define (emphasis txt)
; italics/bold: strong first
(replace
[text] (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 [/text]
txt
(string {<strong>} $2 {</strong>})
8
)
(replace
[text] (\*|_) (?=\S) (.+?) (?<=\S) \1 [/text]
txt
(string {<em>} $2 {</em>})
8
))
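; illustration: "**strong** and _em_" comes out roughly as
; "<strong>strong</strong> and <em>em</em>"; the double markers are handled first
; so that ** is not consumed as two single asterisks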
(define (line-breaks txt)
; handles line break markers
(replace " {2,}\n" txt "
\n" 0))
(define (hex-str-to-unicode-char strng)
; given a five character string, assume it's "U" + 4 hex chars and convert
; return the character...
(char (int (string "0x" (1 strng)) 0 16)))
(define (ustring s)
; any four digit string preceded by U
(replace "U[0-9a-f]{4,}" s (hex-str-to-unicode-char $0) 0))
(define (cleanup txt)
; cleanup the text by normalizing some possible variations
(replace "\r\n|\r" txt "\n" 0) ; standardize line ends
(push "\n\n" txt -1) ; end with two returns
(set 'txt (detab txt)) ; convert tabs to spaces
; convert inline Unicode:
(set 'txt (ustring txt))
(replace "\n[ \t]+\n" txt "\n\n" 0) ; lines with only spaces and tabs
)
(define (protect txt)
; protect or "hash html blocks"
(letn ((nested-block-regex [text](^<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del)\b(.*\n)*?</\2>[ \t]*(?=\n+|\Z))[/text])
(liberal-tag-regex [text](^<(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math)\b(.*\n)*?.*</\2>[ \t]*(?=\n+|\Z))[/text])
(hr-regex [text](?:(?<=\n\n)|\A\n?)([ ]{0,3}<(hr)\b([^<>])*?/?>[ \t]*(?=\n{2,}|\Z))[/text])
(html-comment-regex [text](?:(?<=\n\n)|\A\n?)([ ]{0,3}(?s:<!(--.*?--\s*)+>)[ \t]*(?=\n{2,}|\Z))[/text])
(results '())
(chunk-count (length (set 'chunks (parse txt "\n\n"))))
(chunk-size 500))
; due to a limitation in PCRE, long sections have to be divided up otherwise we'll crash
; so divide up long texts into chunks, then do the regex on each chunk
; not an ideal solution, but it works ok :(
(for (i 0 chunk-count chunk-size)
; do a chunk
(set 'text-chunk (join (i (- (min chunk-count (- (+ i chunk-size) 1)) i) chunks) "\n\n"))
(dolist (rgx (list nested-block-regex liberal-tag-regex hr-regex html-comment-regex))
(replace
rgx
text-chunk
(begin
(set 'key (Hash:hash $1))
(push (list key $1 ) *hashed-html-blocks* -1)
(string "\n\n" key "\n\n"))
2))
; save this partial result
(push text-chunk results -1)
) ; for
; return string result
(join results "\n\n")))
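; illustration: a raw block such as "<div>\nraw html\n</div>" is replaced in the working text
; by its placeholder key on its own line (e.g. "\n\nHASH03\n\n") and remembered in
; *hashed-html-blocks*; form-paragraphs later looks the key up and restores the original block
; instead of wrapping it in <p> tags; the chunking loop only exists to keep the regexes
; within PCRE's limits on very long inputs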
(define (unescape-special-chars t)
; Swap back in all the special characters we've hidden.
(dolist (pair (HashTable))
(replace (last pair) t (first pair) 10)) t)
(define (strip-link-definitions txt)
; strip link definitions from the text and store them
; Link defs are in the form: ^[id]: url "optional title"
; stored in link db list as (id (url title))
(let ((link-db '())
(url {})
(id {})
(title {}))
(replace
[text]^[ ]{0,3}\[(.+)\]:[ \t]*\n?[ \t]*<?(\S+?)>?[ \t]*\n?[ \t]*(?:(?<=\s)["(](.+?)[")][ \t]*)?(?:\n+|\Z)[/text]
txt
(begin
(set 'id (lower-case $1) 'url (amps-and-angles $2) 'title $3)
(if title (replace {"} title {&quot;} 0))
(push (list id (list url title)) link-db)
(set '$3 {}) ; necessary?
(string {}) ; remove from text
)
10)
(set '*link-database* link-db)
txt))
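; illustration (hypothetical definition): a line such as
;   [id1]: http://example.com/  "Optional Title"
; is removed from the text and stored as ("id1" ("http://example.com/" "Optional Title"))
; in *link-database*, where make-anchor and the reference-image code can find it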
(define (horizontal-rules txt)
(replace
[text]^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$[/text]
txt
"\n
" code-block "\n
\n\n")))))
10)))
(define (block-quotes txt)
(let ((block-quote {}))
(replace
[text]((^[ \t]*>[ \t]?.+\n(.+\n)*\n*)+)[/text]
txt
(begin
(set 'block-quote $1)
(replace {^[ ]*>[ ]?} block-quote {} 2)
(replace {^[ ]+$} block-quote {} 2)
(set 'block-quote (block-transforms block-quote)) ; recurse
; remove leading spaces
(replace
{(\s*<pre>.+?</pre>)} block-quote (trim $1) 2)
(string "<blockquote>\n" block-quote "\n</blockquote>\n\n"))
2)))

(define (outdent s)
(replace [text]^(\t|[ ]{1,4})[/text] s {} 2))

(define (detab s)
(replace [text](.*?)\t[/text] s (string $1 (dup { } (- 4 (% (length $1) 4)))) 2))

(define (form-paragraphs txt)
(let ((grafs '())
(original nil))
(set 'txt (trim txt "\n")) ; strip blank lines before and after
(set 'grafs (parse txt "\n{2,}" 0)) ; split
(dolist (p grafs)
(if (set 'original (lookup p *hashed-html-blocks*))
; html blocks
(setf (grafs $idx) original)
; wrap <p> tags round everything else
(setf (grafs $idx) (string {<p>} (replace {^[ ]*} (span-transforms p) {} (+ 4 8 16)) {</p>}))))
(join grafs "\n\n")))

[text]
; three command line arguments: let's hope the last one is a file
(when (= 3 (length (main-args)))
(println (markdown (read-file (main-args 2))))
(exit))

; hack for command-line and module loading
(set 'level (sys-info 3))
; if level is 2, then we're probably invoking markdown.lsp directly
; if level is > 3, then we're probably loading it into another script...
(when (= level 2)
; running on command line, read STDIN and execute:
(while (read-line) (push (current-line) *stdin* -1))
(println (markdown (join *stdin* "\n")))
(exit))
[/text]

;; version 2011-09-16 16:31:29
;; Changed to a different hash routine. Profiling shows that hashing takes 40% of the execution time.
;; Unfortunately this new version is only very slightly faster.
;; Command-line arguments hack in previous version doesn't work.
;;
;; version 2011-08-18 15:04:40
;; various fixes, and added hack for running this from the command line:
;;   echo "hi there" | newlisp markdown.lsp
;;   echo "hello world" | markdown.lsp
;;   cat file.text | newlisp markdown.lsp
;;
;; version 2010-11-14 17:34:52
;; some problems in ustring. Probably remove it one day, as it's non-standard...
;;
;; version 2010-10-14 18:41:38
;; added code to work round PCRE crash in (protect ...
;;
;; version date 2010-07-10 22:20:25
;; modified call to 'read' since lutz has changed it
;;
;; version date 2009-11-16 22:10:10
;; fixed bug in tokenize-html
;;
;; version date 2008-10-08 18:44:46
;; changed nth-set to setf to be version-10 ready.
;; This means that this script will NOT work with earlier versions of newLISP!
;; requires Nestor if you want source code colouring...
;;
;; version date 2008-08-08 16:54:56
;; changed (unless to (if (not ... :(
;;
;; version date 2008-07-20 14:12:29
;; added hex-str-to-unicode-char ustring
;;
;; version date 2008-03-07 15:36:09
;; fixed load error
;;
;; version date 2007-11-17 16:20:57
;; added syntax colouring module
;;
;; version date 2007-11-14 09:19:42
;; removed reliance on dostring for compatibility with 9.1

; eof