One day I realized I needed the images from one of the MS Whiteboards. They were my photos anyway. Surprisingly, there was no option in the UI to extract them directly, so - as usual - I ended up using Python, and - as usual - it turned out to be quite straightforward.
To extract the images, first export your Whiteboard as a ZIP file, then run the script:
python whiteboard_extract.py MyBoard.zip
Optional output folder:
python whiteboard_extract.py MyBoard.zip --out ./whiteboard_imgs
The script does the following:
- Extracts the HTML file from the archive
- Finds all inline images having class="ms-Image-image"
- Converts the Base64 data to a binary blob
- Automatically detects the file type (supported: SVG, PNG, JPEG)
- Writes each image to disk using the detected file extension
The code is available in this GitHub Gist
#!/usr/bin/env python3
from zipfile import ZipFile
import base64
import logging
import argparse
import sys
from pathlib import Path
from typing import Iterable, Optional
try:
from bs4 import BeautifulSoup
except ImportError as e:
print("Missing dependency: beautifulsoup4. Install with: pip install beautifulsoup4", file=sys.stderr)
sys.exit(2)
try:
import filetype
except ImportError as e:
print("Missing dependency: filetype. Install with: pip install filetype", file=sys.stderr)
sys.exit(2)
def parse_args(argv: Optional[Iterable[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the extractor.

    Args:
        argv: Argument strings to parse. When ``None``, argparse falls back
            to the process command line.

    Returns:
        A namespace with ``archive`` (Path), ``output_dir`` (Path or None),
        ``verbose`` (number of ``-v`` flags given) and ``quiet`` (bool).
    """
    cli = argparse.ArgumentParser(
        description="Extract inline data-URI images from a Microsoft Whiteboard export (.zip)."
    )

    # One (flags, settings) pair per option, registered in declaration order
    # so the generated --help output keeps the same layout.
    option_specs = (
        (
            ("archive",),
            {"type": Path, "help": "Path to the Whiteboard export .zip"},
        ),
        (
            ("-o", "--output-dir"),
            {
                "type": Path,
                "help": "Directory to write images (default: directory of the archive)",
            },
        ),
        (
            ("-v", "--verbose"),
            {
                "action": "count",
                "default": 0,
                "help": "Increase verbosity (-v, -vv)",
            },
        ),
        (
            ("-q", "--quiet"),
            {"action": "store_true", "help": "Only print errors"},
        ),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args(argv)
def configure_logging(verbose: int, quiet: bool) -> None:
    """Set up the root logger according to the verbosity flags.

    Args:
        verbose: Number of ``-v`` flags; 1 selects INFO, 2 or more DEBUG,
            anything else WARNING.
        quiet: When true, only errors are shown and ``verbose`` is ignored.
    """
    # A single flat chain: quiet wins, then the verbosity count decides.
    if quiet:
        threshold = logging.ERROR
    elif verbose >= 2:
        threshold = logging.DEBUG
    elif verbose == 1:
        threshold = logging.INFO
    else:
        threshold = logging.WARNING

    logging.basicConfig(level=threshold, format="%(levelname)s: %(message)s")
def run(archive: Path, out_dir: Optional[Path]) -> int:
    """Extract the inline images of a Whiteboard export and write them to disk.

    The first ``*.html`` member of *archive* is parsed, every
    ``<img class="ms-Image-image">`` whose ``src`` carries base64 data is
    decoded, and the result is saved as ``<id>.<ext>`` in the output
    directory. Images that cannot be decoded are skipped with a warning.

    Args:
        archive: Path to the exported ``.zip`` file.
        out_dir: Directory for the extracted images; when falsy, the
            directory containing *archive* is used. It is created if missing.

    Returns:
        0 on success, 1 when the archive is missing or unreadable, contains
        no HTML file, or a file cannot be written.
    """
    # Function-scope import: the module header only imports ZipFile.
    from zipfile import BadZipFile

    output_dir = out_dir or archive.resolve().parent

    if not archive.exists():
        logging.error("Archive not found: %s", archive)
        return 1

    html_content = None
    try:
        # The context manager closes the archive even when reading fails.
        with ZipFile(archive) as input_zip:
            for name in input_zip.namelist():
                if name.lower().endswith(".html"):
                    html_content = input_zip.read(name).decode("utf-8")
                    break
    except (BadZipFile, OSError, UnicodeDecodeError) as exc:
        logging.error("%s: %s", archive, exc)
        return 1

    if html_content is None:
        logging.error("Can't find an *.html file inside of the archive")
        return 1

    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logging.error("Can't create output directory %s: %s", output_dir, exc)
        return 1

    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all images
    for index, img in enumerate(soup.find_all('img', {'class': 'ms-Image-image'})):
        # The id becomes the file name: fall back to a positional name when it
        # is absent and neutralise path separators so it stays in output_dir.
        raw_id = img.get("id") or f"image_{index}"
        image_id = str(raw_id).replace("/", "_").replace("\\", "_")

        data_hdr, separator, data_str = (img.get("src") or "").partition(";base64,")
        if not separator:
            logging.warning("Found an image without base64 data, id = %s", image_id)
            continue

        try:
            binary_data = base64.b64decode(data_str)
        except ValueError as exc:  # binascii.Error is a ValueError subclass
            logging.warning("Invalid base64 data for %s: %s", image_id, exc)
            continue

        file_ext = "bin"
        if data_hdr == "data:image/svg+xml":
            file_ext = "svg"
        elif data_hdr == "data:text/plain":
            # This header carries no real type, so sniff the decoded bytes.
            data_type = filetype.guess(binary_data)
            if data_type is not None:
                file_ext = data_type.EXTENSION
            else:
                logging.warning(
                    "Couldn't guess file type for %s, assuming binary", image_id
                )
        else:
            logging.warning(
                "Unknown data header for %s: %s, assuming binary", image_id, data_hdr
            )

        file_name = output_dir / f"{image_id}.{file_ext}"
        logging.info("Exporting to %s", file_name)
        try:
            file_name.write_bytes(binary_data)
        except OSError as exc:
            logging.error("Can't write %s: %s", file_name, exc)
            return 1

    return 0
def main(argv: Optional[Iterable[str]] = None) -> None:
    """Command-line entry point: parse options, set up logging, extract.

    Args:
        argv: Argument strings forwarded to ``parse_args``; ``None`` is
            passed through unchanged.

    Raises:
        SystemExit: Always, carrying the status code returned by ``run``.
    """
    options = parse_args(argv)
    configure_logging(options.verbose, options.quiet)
    status = run(archive=options.archive, out_dir=options.output_dir)
    sys.exit(status)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()