Whiteboard Image Extractor (Python)

One day I realized I needed the images from one of the MS Whiteboards. They were my photos anyway. Surprisingly, there was no option in the UI to extract them directly, so - as usual - I ended up using Python, and - as usual - it turned out to be quite straightforward.

To extract the images, first export your Whiteboard as a ZIP file, then run the script:

python whiteboard_extract.py MyBoard.zip

Optional output folder:

python whiteboard_extract.py MyBoard.zip --output-dir ./whiteboard_imgs

The script does the following:

Extracts the HTML file from the archive
Finds all inline images having class="ms-Image-image"
Converts the Base64 data to a binary blob
Automatically detects the file type (supported: SVG, PNG, JPEG)
Writes each image to disk using the detected file extension

The code is available in this GitHub Gist

#!/usr/bin/env python3
from zipfile import ZipFile
import base64
import logging
import argparse
import sys
from pathlib import Path
from typing import Iterable, Optional

try:
    from bs4 import BeautifulSoup
except ImportError as e:
    print("Missing dependency: beautifulsoup4. Install with: pip install beautifulsoup4", file=sys.stderr)
    sys.exit(2)

try:
    import filetype
except ImportError as e:
    print("Missing dependency: filetype. Install with: pip install filetype", file=sys.stderr)
    sys.exit(2)

def parse_args(argv: Optional[Iterable[str]] = None) -> argparse.Namespace:
    """Parse the command line of the Whiteboard image extractor.

    Args:
        argv: Argument strings to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``archive`` (Path), ``output_dir`` (Path or None),
        ``verbose`` (int, number of ``-v`` flags) and ``quiet`` (bool).
    """
    # (flags, keyword arguments) for every argument, in registration order.
    argument_specs = (
        (
            ("archive",),
            {"type": Path, "help": "Path to the Whiteboard export .zip"},
        ),
        (
            ("-o", "--output-dir"),
            {
                "type": Path,
                "help": "Directory to write images (default: directory of the archive)",
            },
        ),
        (
            ("-v", "--verbose"),
            {
                "action": "count",
                "default": 0,
                "help": "Increase verbosity (-v, -vv)",
            },
        ),
        (
            ("-q", "--quiet"),
            {"action": "store_true", "help": "Only print errors"},
        ),
    )

    cli = argparse.ArgumentParser(
        description="Extract inline data-URI images from a Microsoft Whiteboard export (.zip)."
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args(argv)

def configure_logging(verbose: int, quiet: bool) -> None:
    """Configure the root logger from the verbosity flags.

    Args:
        verbose: Number of ``-v`` flags; 1 selects INFO, 2 or more DEBUG,
            anything else WARNING.
        quiet: When true, only errors are shown and ``verbose`` is ignored.
    """
    if quiet:
        chosen_level = logging.ERROR
    elif verbose >= 2:
        chosen_level = logging.DEBUG
    elif verbose == 1:
        chosen_level = logging.INFO
    else:
        chosen_level = logging.WARNING

    logging.basicConfig(level=chosen_level, format="%(levelname)s: %(message)s")

def _read_board_html(archive: Path) -> Optional[str]:
    """Return the text of the first ``*.html`` member of *archive*, or None.

    Every failure (unreadable archive, no HTML member) is logged here, so the
    caller only has to check for ``None``.
    """
    # Local import: the module's top-level import only brings in ZipFile.
    from zipfile import BadZipFile

    try:
        # The context manager closes the archive on every exit path.
        with ZipFile(archive) as input_zip:
            for name in input_zip.namelist():
                if name.endswith(".html"):
                    # Only the ASCII base64 payloads matter, so a stray
                    # undecodable byte must not abort the whole export.
                    return input_zip.read(name).decode("utf-8", errors="replace")
    except (BadZipFile, OSError) as e:
        logging.error("%s: %s", archive, e)
        return None

    logging.error("Can't find an *.html file inside of the archive")
    return None


def _pick_extension(data_hdr: str, binary_data: bytes, image_id: str) -> str:
    """Choose a file extension from the data-URI header and the decoded bytes."""
    if data_hdr == "data:image/svg+xml":
        return "svg"

    if data_hdr == "data:text/plain":
        # This header says nothing about the real format, so the type is
        # sniffed from the decoded bytes.
        data_type = filetype.guess(binary_data)
        if data_type is not None:
            return data_type.extension
        logging.warning("Couldn't guess file type for %s, assuming binary", image_id)
        return "bin"

    logging.warning(
        "Unknown data header for %s: %s, assuming binary", image_id, data_hdr
    )
    return "bin"


def run(archive: Path, out_dir: Optional[Path]) -> int:
    """Extract every inline ``ms-Image-image`` picture from a Whiteboard export.

    Args:
        archive: Path to the exported ``.zip`` file.
        out_dir: Directory to write the images to; when falsy, the directory
            containing *archive* is used. It is created if it does not exist.

    Returns:
        0 on success, 1 when the archive is missing or unreadable, contains
        no ``*.html`` member, the output directory cannot be created, or at
        least one image could not be written. Images without usable base64
        data are skipped with a warning and do not affect the result.
    """
    output_dir = out_dir or archive.resolve().parent

    if not archive.exists():
        logging.error("Archive not found: %s", archive)
        return 1

    html_content = _read_board_html(archive)
    if html_content is None:
        return 1

    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logging.error("Can't create output directory %s: %s", output_dir, e)
        return 1

    soup = BeautifulSoup(html_content, "html.parser")

    exit_code = 0
    for index, img in enumerate(soup.find_all("img", {"class": "ms-Image-image"})):
        # Fall back to a positional name so an <img> without an id is still exported.
        image_id = img.get("id") or f"image_{index}"

        data_hdr, marker, data_str = (img.get("src") or "").partition(";base64,")
        if not marker:
            logging.warning("Found an image without base64 data, id = %s", image_id)
            continue

        try:
            binary_data = base64.b64decode(data_str)
        except ValueError as e:  # binascii.Error is a ValueError subclass
            logging.warning("Invalid base64 data for %s: %s", image_id, e)
            continue

        file_ext = _pick_extension(data_hdr, binary_data, image_id)

        # The id comes from the archive's HTML; strip path separators so it
        # cannot point outside the output directory.
        safe_id = image_id.replace("/", "_").replace("\\", "_")
        file_name = output_dir / f"{safe_id}.{file_ext}"
        logging.info("Exporting to %s", file_name)
        try:
            file_name.write_bytes(binary_data)
        except OSError as e:
            logging.error("Can't write %s: %s", file_name, e)
            exit_code = 1

    return exit_code

def main(argv: Optional[Iterable[str]] = None) -> None:
    """Command-line entry point: parse arguments, set up logging, run, exit.

    Args:
        argv: Argument strings handed to ``parse_args``; ``None`` means the
            process command line.

    The process always terminates via ``sys.exit`` with the status code
    returned by ``run``.
    """
    options = parse_args(argv)
    configure_logging(options.verbose, options.quiet)
    status = run(archive=options.archive, out_dir=options.output_dir)
    sys.exit(status)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Whiteboard Image Extractor (Python)

Similar Posts