Migration
When I tried to migrate my WordPress site, I was initially limited to exporting its content as XML files, which presented some challenges for the migration process. To tackle this, I developed a Python script with some assistance from ChatGPT.
Thanks to this approach, I was able to streamline the entire migration process significantly. Instead of spending countless hours manually editing the files or dealing with formatting issues, I could successfully migrate all my WordPress posts in two to three hours. However, I still had to download all the images and manually insert them into the new site. While the process was efficient overall, that extra step required additional time and effort to ensure everything looked right. In the end, I was pleased with how efficiently half of the migration went!
import xml.etree.ElementTree as ET
from datetime import datetime
import os
import html2text
import re
# Matches a WordPress heading block that carries an explicit "level" attribute.
# Group 1 is the level digit, group 2 the inner HTML of the <hN> element.
# No DOTALL flag: the block comment and the <hN> tag must be on the same line.
_HEADING_RE = re.compile(r"<!-- wp:heading {\"level\":(\d)} -->.*?<h\d.*?>(.*?)</h\d>")
# Matches a WordPress spacer block (the comment plus its aria-hidden <div>).
_SPACER_RE = re.compile(r"<!-- wp:spacer.*?-->.*?<div.*?aria-hidden.*?class.*?>.*?</div>")
# Matches any HTML tag; used to strip inline markup from heading text.
_TAG_RE = re.compile(r'<.*?>')
# Matches Markdown bold spans so the ** markers can be dropped.
_BOLD_RE = re.compile(r'\*\*(.*?)\*\*')
# Filename clean-up: spaces -> "_", "/" -> "-", "?" and ":" removed.
_FILENAME_TABLE = str.maketrans({" ": "_", "/": "-", "?": None, ":": None})

_FALLBACK_DATE = "2020-01-01"


def _child_text(item, tag, default):
    """Return the stripped text of child *tag*, or *default* if missing or empty."""
    # findtext() returns None when the child is absent and "" when it is
    # present but has no text, so an empty element never hits None.strip().
    text = item.findtext(tag)
    if text is None:
        return default
    return text.strip() or default


def _extract_headings(content_html):
    """Remove matched heading blocks; return (remaining_html, markdown_headings)."""
    headings = []

    def replace_heading(match):
        level = int(match.group(1))
        heading_text = _TAG_RE.sub('', match.group(2))
        headings.append(f"\n{'#' * level} {heading_text}")
        return ""  # The heading is emitted separately, so drop it from the HTML

    remaining = _HEADING_RE.sub(replace_heading, content_html)
    return remaining, headings


def _unique_filename(pub_date, title, used_stems):
    """Build '<date>_<title>.md', adding _2, _3, ... if the name was already used."""
    stem = f"{pub_date}_{title.translate(_FILENAME_TABLE)[:50]}"
    candidate = stem
    suffix = 2
    while candidate in used_stems:
        candidate = f"{stem}_{suffix}"
        suffix += 1
    used_stems.add(candidate)
    return f"{candidate}.md"


def _front_matter(title, pub_date, draft):
    """Return the YAML front matter block, including the trailing blank line."""
    # Backslash and double quote are special inside a double-quoted YAML
    # scalar; escape them so such titles still yield valid front matter.
    escaped_title = title.replace("\\", "\\\\").replace('"', '\\"')
    lines = [
        "---",
        f"title: \"{escaped_title}\"",
        f"pubDate: \"{pub_date}\"",
        "description: \"\"",
    ]
    if draft:
        lines.append("draft: true")
    lines.append("---")
    return "\n".join(lines) + "\n\n"


def xml_to_markdown(xml_file, output_dir):
    """Convert each <item> of an XML export into a Markdown file with front matter.

    For every ``item`` element that is a direct child of the XML root, the
    ``title``, ``date`` and ``content`` children are read, the HTML content is
    converted to Markdown with html2text, and the result is written to
    ``<output_dir>/<YYYY-MM-DD>_<title>.md``.

    Args:
        xml_file: Path of the XML file to parse.
        output_dir: Directory that receives the ``.md`` files; it is created
            if it does not exist.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_file* is not well-formed XML.
        OSError: If the input cannot be read or an output file cannot be written.
    """
    root = ET.parse(xml_file).getroot()
    os.makedirs(output_dir, exist_ok=True)

    h2t = html2text.HTML2Text()
    h2t.ignore_links = False  # Keep links in the Markdown output
    h2t.body_width = 0  # 0 disables html2text's line wrapping

    # Stems already written in this run, so two posts with the same date and
    # title prefix do not silently overwrite each other.
    used_stems = set()

    # findall('item') only visits direct children of the root element.
    for item in root.findall('item'):
        title = _child_text(item, 'title', "Untitled")

        # A missing, empty or unparseable date falls back to the sentinel
        # date, which also marks the post as a draft.
        date_str = _child_text(item, 'date', f"{_FALLBACK_DATE} 00:00:00")
        try:
            pub_date = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d")
        except ValueError:
            pub_date = _FALLBACK_DATE
        draft = pub_date == _FALLBACK_DATE

        content_html = _child_text(item, 'content', "")
        content_html = _SPACER_RE.sub('', content_html)
        content_html, headings = _extract_headings(content_html)

        # Convert what is left to Markdown, then drop bold markers.
        content_md = _BOLD_RE.sub(r'\1', h2t.handle(content_html))
        paragraphs = [line for line in content_md.strip().splitlines() if line.strip()]

        # NOTE(review): every heading matched by _HEADING_RE is emitted before
        # all body text, so those headings lose their position in the post.
        # Kept as-is to preserve existing output; confirm whether intended.
        combined_content = '\n'.join(headings + paragraphs)

        filename = _unique_filename(pub_date, title, used_stems)
        md_content = _front_matter(title, pub_date, draft) + combined_content

        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
            f.write(md_content)
# Where the exported XML lives and where the Markdown files should go
input_xml = "wpiedata.xml"
output_directory = "markdown_files"

# Convert every exported post, then report the destination directory
xml_to_markdown(xml_file=input_xml, output_dir=output_directory)
print(f"Markdown files have been created in the '{output_directory}' directory.")