Migration

When I tried to migrate my WordPress site, I was initially limited to exporting its content as XML files, which made the migration more challenging. To tackle this, I wrote a Python script with some assistance from ChatGPT.

Thanks to this approach, I was able to streamline the migration process significantly. Instead of spending countless hours manually editing the files or dealing with formatting issues, I migrated all my WordPress posts in two to three hours. However, I still had to download all the images and manually insert them into the new site, and that extra step took additional time and effort to make sure everything looked right. Overall, I was pleased with how efficiently the automated half of the migration went!

import xml.etree.ElementTree as ET
from datetime import datetime
import os
import html2text
import re

# WordPress block markup puts the block comment and its element on separate
# lines, so the gap between them is matched with \s* (a plain "." would stop
# at the newline and the pattern would never fire on such content).
_HEADING_RE = re.compile(
    r'<!-- wp:heading \{"level":(\d)\} -->\s*<h\d[^>]*>(.*?)</h\d>',
    re.DOTALL,
)
_SPACER_RE = re.compile(
    r'<!-- wp:spacer.*?-->\s*<div[^>]*aria-hidden[^>]*class[^>]*>\s*</div>'
)
_TAG_RE = re.compile(r'<.*?>')
_BOLD_RE = re.compile(r'\*\*(.*?)\*\*')

# Date used when an item has no usable date; such items are flagged as drafts.
_FALLBACK_DATE = "2020-01-01"


def _child_text(item, tag, default):
    """Return the stripped text of child *tag*, or *default* if it is missing or empty."""
    child = item.find(tag)
    # An element such as <title/> exists but has text None; treat it as missing
    # instead of failing on None.strip().
    if child is None or child.text is None:
        return default
    return child.text.strip()


def _markdown_lines(converter, html):
    """Convert an HTML fragment to its non-blank Markdown lines with bold markers removed."""
    if not html.strip():
        return []
    markdown = _BOLD_RE.sub(r'\1', converter.handle(html))
    return [line for line in markdown.strip().splitlines() if line.strip()]


def _body_lines(converter, html):
    """Return the Markdown lines for *html*, keeping wp:heading blocks in document order."""
    lines = []
    cursor = 0
    for match in _HEADING_RE.finditer(html):
        # Convert whatever precedes this heading first so the heading stays
        # where the author put it instead of being hoisted to the top.
        lines.extend(_markdown_lines(converter, html[cursor:match.start()]))
        # Drop inline tags (<strong>, <em>, ...) and fold any line breaks so
        # the heading stays on a single Markdown line.
        text = " ".join(_TAG_RE.sub('', match.group(2)).split())
        # The leading newline yields a blank line before the heading once the
        # lines are joined.
        lines.append(f"\n{'#' * int(match.group(1))} {text}")
        cursor = match.end()
    lines.extend(_markdown_lines(converter, html[cursor:]))
    return lines


def _yaml_quoted(value):
    """Return *value* as a YAML double-quoted scalar with backslashes and quotes escaped."""
    escaped = value.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def xml_to_markdown(xml_file, output_dir):
    """Write one Markdown file with front matter per <item> of an XML export.

    Each <item> that is a direct child of the XML root is read for its
    ``title``, ``date`` ("%Y-%m-%d %H:%M:%S") and ``content`` (HTML) children.
    The content is converted to Markdown with html2text; wp:spacer blocks are
    dropped, wp:heading blocks that carry an explicit level become ATX
    headings at their original position, and bold markers are removed.

    Items whose date is missing or unparseable get the date 2020-01-01 and
    ``draft: true`` in the front matter. NOTE(review): an item genuinely dated
    2020-01-01 is flagged as a draft as well.

    Args:
        xml_file: Path (or file object) of the XML export to parse.
        output_dir: Directory that receives the ``<date>_<title>.md`` files;
            created if it does not exist.

    Raises:
        xml.etree.ElementTree.ParseError: If the XML is not well formed.
        OSError: If the input cannot be read or an output file cannot be written.
    """
    root = ET.parse(xml_file).getroot()
    os.makedirs(output_dir, exist_ok=True)

    converter = html2text.HTML2Text()
    converter.ignore_links = False  # Keep links in the Markdown output
    converter.body_width = 0  # 0 turns off html2text's line wrapping

    for item in root.findall('item'):
        title = _child_text(item, 'title', "Untitled")

        date_str = _child_text(item, 'date', f"{_FALLBACK_DATE} 00:00:00")
        try:
            pub_date = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d")
        except ValueError:
            pub_date = _FALLBACK_DATE  # Date present but not in the expected format

        content_html = _SPACER_RE.sub('', _child_text(item, 'content', ""))
        body = '\n'.join(_body_lines(converter, content_html))

        # File name: date plus a shortened, path-safe version of the title
        safe_title = title.replace(" ", "_").replace("/", "-").replace("?", "").replace(":", "")[:50]
        filename = f"{pub_date}_{safe_title}.md"

        front_matter = [
            "---",
            f"title: {_yaml_quoted(title)}",
            f"pubDate: \"{pub_date}\"",
            "description: \"\"",  # Left empty for manual editing
        ]
        if pub_date == _FALLBACK_DATE:
            front_matter.append("draft: true")
        front_matter.append("---")

        md_content = '\n'.join(front_matter) + "\n\n" + body

        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
            f.write(md_content)

# Export file to read and the folder that receives the generated Markdown
input_xml, output_directory = "wpiedata.xml", "markdown_files"

# Convert every item in the export, then report where the files were written
xml_to_markdown(input_xml, output_directory)
print(f"Markdown files have been created in the '{output_directory}' directory.")