I’m working through converting my blog from Drupal to Jekyll (it’s a long story), and one of the things I needed to do was to convert a bunch of posts originally written in HTML into Markdown. With a little application of PowerShell, most of the heavy lifting was done fairly quickly - leaving just a manual review and tweak of each post.
Here’s the core of the PowerShell script I used:
# Convert every post under .\_posts from HTML markup to Markdown and write the
# result, under the same file name, into .\_processed.
# NOTE(review): wrap-string is not defined in this snippet; it is assumed to be
# available in the session (module or helper function) - confirm before running.
foreach ($source in (Get-ChildItem .\_posts\*.md)) {
    $sourceName = $source.Name
    Write-Host $sourceName

    # Load the contents of the file as one CRLF-delimited string. The -join
    # operator always yields a string (even for an empty file) and does not
    # depend on a Join-String implementation that supports -NewLine.
    $content = (Get-Content -LiteralPath $source.FullName) -join "`r`n"

    # Convert links from <a> to Markdown style
    $content = $content -replace '<a\s+href="([^"]+)">([^<]+)</a>', '[$2]($1)'

    # Convert paragraphs and lists. List containers are processed one tag at a
    # time, in this order, because each pattern also swallows the whitespace
    # around the tag.
    foreach ($tag in 'ul', '/ul', 'ol', '/ol') {
        $content = $content -replace "\s*<$tag>\s*", "`r`n"
    }
    $content = $content -replace '</?p>', "`r`n"
    $content = $content -replace '<li>', "`r`n * "
    $content = $content -replace '</li>', ''

    # Word wrap each paragraph at 120 columns
    $content = @($content -split "`r`n" | ForEach-Object { wrap-string $_ 120 }) -join "`r`n"

    # Word/phrase highlighting
    $content = $content -replace '</?em>', '*'
    $content = $content -replace '</?(b|strong)>', '**'

    # HTML-escaped double quotes become apostrophes
    $content = $content -replace '&quot;', "'"

    # Drop the Drupal teaser marker
    $content = $content -replace '<!--break-->', ''

    # Eliminate excess whitespace.
    # .NET regexes take no /.../ delimiters; (?m) makes ^ and $ work per line.
    # Only spaces and tabs are removed so the line breaks themselves survive.
    $content = $content -replace '(?m)^[ \t]+(?=\r?$)', ''
    # Collapse any run of three or more line breaks to a single blank line.
    $content = $content -replace "(`r`n){3,}", "`r`n`r`n"

    Set-Content -LiteralPath (Join-Path .\_processed $sourceName) -Value $content
}