Monday, March 11, 2019

Use Python to migrate all posts from Google+ to BlogSpot (blogger)

Google+ is going to be closed down very soon.

I built a Python 3.7.2 script to migrate all posts from Google+ to BlogSpot (Blogger).

Below are the steps.

1. Download credentials.json from:

https://developers.google.com/blogger/docs/3.0/using

2. Log in to https://www.blogger.com, then get the BlogId from the URL

3. Download the Google+ backup, then uncompress it to a folder

4. Modify the "BLOGID", the post-file folder, and the credentials.json file path in the source code (highlighted)

5. Run the script

www.blogger.com only allows uploading around 700 posts per day, so we will get a "rateLimitExceeded" exception during the upload. This script will automatically retry uploading the file when that happens.

To differentiate these migrated posts from the original posts, I added ' - GooglePlus' to the end of the post title.

Photos are not migrated.

The source code is shared here.

================

import time
import logging
import os
import fnmatch
from html.parser import HTMLParser

from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Verbose log format: timestamp (with ms), level, source location, message.
logging.basicConfig(format='%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='%Y-%m-%d:%H:%M:%S',
                    level=logging.DEBUG)

# FILE_FOLDER = 'C:\\EricFang\\Python3\\'
# Folder containing the uncompressed Google+ backup's per-post HTML files.
FILE_FOLDER = '\\\\pwdaddy\\Posts\\'
# OAuth client secrets file downloaded from the Google developer console.
FILE_CREDENTIALS = 'C:\\EricFang\\Python3\\AddPostsToBlogSpot\\credentials.json'
# Only files matching this glob are treated as posts.
FILE_PATTERN = '*.html'

# Blog id taken from the blogger.com URL after logging in (step 2 above).
BLOGID = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxx'
API_SERVICE_NAME = 'blogger'
API_VERSION = 'v3'

# https://developers.google.com/blogger/docs/3.0/using
# https://developers.google.com/identity/protocols/googlescopes
# SCOPES = ['https://www.googleapis.com/auth/blogger.readonly']
# Full read/write scope is needed to insert posts.
SCOPES = ['https://www.googleapis.com/auth/blogger']

# Interactive OAuth flow: run_local_server() opens a browser and spins up a
# local redirect server to capture the authorization code.
g_appflow = InstalledAppFlow.from_client_secrets_file(
    FILE_CREDENTIALS, SCOPES)
# NOTE(review): auth_url is never used afterwards; run_local_server() builds
# its own authorization URL — this line looks like leftover code.
auth_url, _ = g_appflow.authorization_url(prompt='consent')

g_creds = g_appflow.run_local_server()

# Blogger v3 API client shared by all functions below.
g_service = build(API_SERVICE_NAME, API_VERSION, credentials=g_creds)

# Running count of posts uploaded in this session (used for progress logs).
g_count_uploaded = 0


def ReadFile(fileContentText):
    """Extract the inner HTML of the 'main-content' div from a backup page.

    Args:
        fileContentText: full HTML text of one Google+ backup post file.

    Returns:
        The children of the <div class="main-content"> element serialized
        back into an HTML string, suitable as a Blogger post body.
    """
    soup = BeautifulSoup(fileContentText, features="html.parser")
    content_div = soup.body.find('div', attrs={'class': 'main-content'})
    return ''.join(str(child) for child in content_div.contents)


def addPost(postTitle, postContent):
    """Insert one post into the blog, retrying on rate limiting.

    Blogger caps uploads at roughly 700 posts/day; when the API raises a
    rateLimitExceeded error we sleep 60 seconds and try again. Any other
    error is logged and re-raised — the original code retried *every*
    exception forever, which would spin endlessly on e.g. an auth failure.

    Args:
        postTitle: title of the new post.
        postContent: HTML body of the new post.
    """
    postBody = {
        "kind": "blogger#post",
        # NOTE(review): per the Blogger v3 API, a post body's 'id' is the
        # *post* id, not the blog id — this field is probably ignored by
        # the server, but confirm before removing.
        "id": BLOGID,
        "title": postTitle,
        "content": postContent
    }
    global g_service
    while True:
        try:
            postsInsertAction = g_service.posts().insert(blogId=BLOGID, body=postBody,
                                                         isDraft=False)
            posts = postsInsertAction.execute()
            break
        except Exception as ex:
            # Only retry when the failure really is rate limiting; anything
            # else (bad credentials, malformed request, ...) must surface.
            if 'rateLimitExceeded' in str(ex):
                logging.info("rateLimitExceeded, wait for 60 seconds......")
                time.sleep(60)
            else:
                logging.exception("unexpected error while inserting post (%s)", postTitle)
                raise

    global g_count_uploaded
    g_count_uploaded += 1
    logging.info('%d, post(%s) is uploaded.', g_count_uploaded, postTitle)


def getPostTitleList():
    """Return a dict mapping existing post titles to their post ids.

    Pages through posts.list so that already-uploaded posts can be skipped
    on a re-run. The original loop issued its first in-loop request with a
    stale (None) page token — re-fetching and re-processing page one — and
    broke out *before* processing the items of the final fetched page; this
    version fetches and processes each page exactly once.
    """
    dictReturn = {}
    nextPageToken = None
    global g_service
    while True:
        postsListAction = g_service.posts().list(blogId=BLOGID, maxResults=20,
                                                 fetchBodies=False,
                                                 pageToken=nextPageToken)
        posts = postsListAction.execute()
        # 'items' is absent when the blog (or page) has no posts.
        for item in posts.get('items', []):
            dictReturn[item['title']] = item['id']
        nextPageToken = posts.get('nextPageToken')
        if not nextPageToken:
            break
    return dictReturn


def main():
    """Upload every *.html file in FILE_FOLDER as a new blog post.

    Files whose derived title already exists on the blog are skipped, so
    the script can safely be re-run after a rate-limit interruption.
    """
    postTitleList = getPostTitleList()

    for fileName in os.listdir(FILE_FOLDER):
        # Idiomatic negation instead of '== False'.
        if not fnmatch.fnmatch(fileName, FILE_PATTERN):
            logging.info('file (%s) is not valid. skip.', fileName)
            continue

        # ' - GooglePlus' suffix marks migrated posts apart from originals.
        postTitle = fileName.replace('.html', ' - GooglePlus')
        if postTitle in postTitleList:
            logging.info('post (%s) exists. skip.', postTitle)
            continue

        logging.debug("file name: %s", fileName)
        with open(os.path.join(FILE_FOLDER, fileName), mode='r', encoding='UTF-8') as f:
            fileContent = f.read()

        fileContent = ReadFile(fileContentText=fileContent)

        addPost(postTitle=postTitle, postContent=fileContent)

    logging.info('completed!')


# Script entry point: run the migration only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()

No comments:

Post a Comment