I’m working through converting my blog from Drupal to Jekyll (it’s a long story), and one of the things I needed to do was convert a bunch of posts originally written in HTML into Markdown. With a little application of PowerShell, most of the heavy lifting was done fairly quickly, leaving just a manual review and tweak of each post.

Here’s the core of the PowerShell script I used:

# Convert every Markdown post under .\_posts from inline HTML to Markdown and
# write the result, under the same file name, into .\_processed.
# Relies on join-string and wrap-string being available in the session; neither
# is defined in this snippet.
foreach( $source in (get-childitem .\_posts\*.md )) {
    $sourceName = $source.Name
    Write-Host $sourceName

    # Load the contents of the file as a single string.
    # FullName is used so the read does not depend on how the FileInfo object
    # happens to stringify.
    $content = get-content $source.FullName | join-string -newline
    $content = "$content"

    # Convert links from <a> to Markdown style: [text](url)
    $content = $content -replace '<a\s+href="([^"]+)">([^<]+)</a>', '[$2]($1)'

    # Convert paragraphs and lists. List/paragraph containers become line
    # breaks; each list item becomes an indented "*" bullet on its own line.
    $content = $content -replace "\s*<ul>\s*", "`r`n"
    $content = $content -replace "\s*</ul>\s*", "`r`n"
    $content = $content -replace "\s*<ol>\s*", "`r`n"
    $content = $content -replace "\s*</ol>\s*", "`r`n"
    $content = $content -replace "<p>", "`r`n"
    $content = $content -replace "</p>", "`r`n"
    $content = $content -replace "<li>", "`r`n  *  "
    $content = $content -replace "</li>", ""

    # Word wrap each paragraph (one paragraph per CRLF-delimited line) at 120 columns
    $content = $content -split "`r`n" | foreach-object { wrap-string $_ 120 } | join-string -separator "`r`n"

    # Word/phrase highlighting
    $content = $content -replace "<em>", "*"
    $content = $content -replace "</em>", "*"
    $content = $content -replace "<b>", "**"
    $content = $content -replace "</b>", "**"
    $content = $content -replace "<strong>", "**"
    $content = $content -replace "</strong>", "**"
    # NOTE(review): &quot; is the double-quote entity; the mapping to a single
    # quote is kept exactly as in the original script - confirm it is intended.
    $content = $content -replace "&quot;", "'"
    $content = $content -replace "<!--break-->", ""

    # Eliminate excess whitespace.
    # -replace takes a bare .NET regex, so the pattern must not be wrapped in
    # /.../ delimiters (they would be matched literally). (?m) makes ^ and $
    # match at line boundaries; the lookahead tolerates the CR of a CRLF ending.
    # Only spaces/tabs are stripped so the line breaks themselves survive.
    $content = $content -replace '(?m)^[ \t]+(?=\r?$)', ''
    # Collapse any run of three or more line breaks to exactly one blank line.
    $content = $content -replace '(\r\n){3,}', "`r`n`r`n"

    set-content .\_processed\$sourceName -value $content
}


blog comments powered by Disqus