#!/usr/bin/env python
'''
------------------------------------------------------------------
Copyright (c) Kevin Whitefoot 2010, <kwhitefo@gmail.com>
Licensed under the GNU General Public License version 2 or later.
------------------------------------------------------------------
A tool to convert the output of emacs-wiki to blogger entries.
The basic idea is to automatically post the html pages created by
emacs-wiki to a blogger account.
The following abilities need to be provided:
- post only if local copy is newer,
- rewrite links so that they still work online,
- sanitize the html so that blogger will accept it.
To make it easy I will use google's python gdata api interface.
Possible milestones:
1: Post without rewriting or sanitizing,
2: Add sanitizing
3: Add link rewriting
Dependencies:
- uses tidy to sanitize the html,
- google's gdata api python wrapper.
'''
__author__ = 'kwhitefo@gmail.com <Kevin Whitefoot>'
# Python imports
import os
import re
import commands
import sys
import getopt
# gdata imports
import gdata.service
import gdata.blogger.client
import gdata.client
import gdata.sample_util
import gdata.data
import atom.data
class WikiBlogger:
def __init__(self, wiki_index, user, pw, connect):
"""Creates a GDataService and provides ClientLogin auth details to it.
The email and password are required arguments for ClientLogin. The
'source' defined below is an arbitrary string, but should be used to
reference your name or the name of your organization, the app name and
version, with '-' between each of the three values.
The connect argument allows offline testing.
"""
# Load the dictionary that holds the association between
# filenames and post ids
self.wiki_index = os.path.basename(wiki_index)
self.wiki_dir = os.path.dirname(wiki_index)
self.blogger_post_ids_fn = os.path.join(self.wiki_dir, '.blogger-post-ids')
self.LoadPostIds()
self.uploaded = [] # Track what has been uploaded so that we
# don't end up in endless loops .
if connect:
# Authenticate using ClientLogin.
self.client = gdata.blogger.client.BloggerClient()
service='blogger'
source='kjw-wikiblogger-0.1',
self.client.client_login(user, pw, source=source, service=service)
# Get the blog ID for the first blog.
feed = self.client.get_blogs()
self.blog = feed.entry[0]
self.blog_id = self.blog.get_blog_id()
def PrintUserBlogTitles(self):
"""Prints a list of all the user's blogs."""
# Request the feed.
feed = self.client.get_blogs()
# Print the results.
print feed.title.text
for entry in feed.entry:
print "\t" + entry.title.text
print
def CreatePost(self, title, content, is_draft):
"""This method creates a new post on a blog. The new post can
be stored as a draft or published based on the value of the
is_draft parameter. The method creates an GDataEntry for the
new post using the title, content, author_name and is_draft
parameters. With is_draft, True saves the post as a draft,
while False publishes the post. Then it uses the given
GDataService to insert the new post. If the insertion is
successful, the added post (GDataEntry,
gdata-2.0.7/pydocs/gdata.html#GDataEntry) will be returned.
"""
return self.client.add_post(self.blog_id, title, content, draft=is_draft)
def LoadPosts(self):
""" Requests the posts feed for the blogs and returns the
entries. """
# Request the feed.
feed = self.client.get_posts(self.blog_id)
self.entries = {}
for entry in feed.entry:
self.entries[entry.post_id] = entry
def PrintPostsInDateRange(self, start_time, end_time):
"""This method displays the title and modification time for any posts that
have been created or updated in the period between the start_time and
end_time parameters. The method creates the query, submits it to the
GDataService, and then displays the results.
Note that while the start_time is inclusive, the end_time is exclusive, so
specifying an end_time of '2007-07-01' will include those posts up until
2007-6-30 11:59:59PM.
The start_time specifies the beginning of the search period (inclusive),
while end_time specifies the end of the search period (exclusive).
"""
# Create query and submit a request.
query = gdata.blogger.client.Query(updated_min=start_time,
updated_max=end_time,
order_by='updated')
print query.updated_min
print query.order_by
feed = self.client.get_posts(self.blog_id, query=query)
# Print the results.
print feed.title.text + " posts between " + start_time + " and " + end_time
print feed.title.text
for entry in feed.entry:
if not entry.title.text:
print "\tNo Title"
else:
print "\t" + entry.title.text
print
def UpdatePostTitle(self, entry_to_update, new_title):
"""This method updates the title of the given post. The GDataEntry object
is updated with the new title, then a request is sent to the GDataService.
If the insertion is successful, the updated post will be returned.
Note that other characteristics of the post can also be modified by
updating the values of the entry object before submitting the request.
The entry_to_update is a GDatEntry containing the post to update.
The new_title is the text to use for the post's new title. Returns: a
GDataEntry containing the newly-updated post.
"""
# Set the new title in the Entry object
entry_to_update.title = atom.data.Title(type='xhtml', text=new_title)
return self.client.update(entry_to_update)
def CreateComment(self, post_id, comment_text):
"""This method adds a comment to the specified post. First the comment
feed's URI is built using the given post ID. Then a GDataEntry is created
for the comment and submitted to the GDataService. The post_id is the ID
of the post on which to post comments. The comment_text is the text of the
comment to store. Returns: an entry containing the newly-created comment
NOTE: This functionality is not officially supported yet.
"""
return self.client.add_comment(self.blog_id, post_id, comment_text)
def PrintAllComments(self, post_id):
"""This method displays all the comments for the given post. First the
comment feed's URI is built using the given post ID. Then the method
requests the comments feed and displays the results. Takes the post_id
of the post on which to view comments.
"""
feed = self.client.get_post_comments(self.blog_id, post_id)
# Display the results
print feed.title.text
for entry in feed.entry:
print "\t" + entry.title.text
print "\t" + entry.updated.text
print
def DeleteComment(self, comment_entry):
"""This method removes the comment specified by the given edit_link_href, the
URI for editing the comment.
"""
self.client.delete(comment_entry)
def DeletePost(self, post_entry):
"""This method removes the post specified by the given edit_link_href, the
URI for editing the post.
"""
self.client.delete(post_entry)
def LoadPostIds(self):
"""Load the file that associates files and postids.
"""
fn = self.blogger_post_ids_fn
if os.path.exists(fn):
f = open(fn, "r")
t = f.read()
f.close()
try:
self.post_ids = eval(t)
except:
print "Warning corrupt status file, resetting"
self.post_ids = {}
else:
print "post_ids file does not exist: ", fn
self.post_ids = {}
def SavePostIds(self):
"""Saves the file that associates files and postids.
"""
fn = self.blogger_post_ids_fn
f = open(fn, "w")
f.write(self.post_ids.__repr__())
f.close()
def Upload(self, max_files):
"""Upload changed files. Max_files allows us to pace
ourselves to avoid hitting the Blogger limit of 50 upload a
day.
"""
print "Upload"
for fn in os.listdir(self.wiki_dir):
print "fn: ", fn
if self.qUpload(fn):
max_files -= 1
if max_files <= 0:
print "Uploaded max. files"
break
def UploadFromPending(self, max_files):
"""
The queue must be primed with the name of the index file.
"""
print "UploadFromPending"
self.pending = self.PrimePending()
while self.pending:
fn = self.pending.pop()
# Note short circuit in next line.
if not fn in self.uploaded and self.qUpload(fn):
max_files -= 1
if max_files <= 0:
print "Uploaded max. files"
break
def qUpload(self, fn):
"""Upload file if newer than entry on blog.
"""
print "fn: ", fn
# if (fn[0:1] == '.' or fn[-5:] != '.html'):
# # don't attempt to upload control files or non-html files.
# print "Ignoring: ", fn
# return False
id_time = self.post_ids.get(fn)
full_name = os.path.join(self.wiki_dir, fn)
if not os.path.exists(full_name):
# no such file
print "Ignoring non-existent file: ", full_name
return False
mtime = os.stat(full_name).st_mtime
# Load and rewrite here so that we can get the list of links
# in the document.
title, body = self.LoadAndRewrite(full_name, fn)
self.QueueLinks(full_name)
post = None
if id_time is None:
# not present so upload
post = self.UploadOne(full_name, fn, title, body)
else:
# Present, check if changed
if len(id_time) == 2:
# Old style post_ids file did not have posted_url.
#posted_url = ""
posted_id, post_time = id_time
else:
posted_id, posted_url, post_time = id_time
if post_time < mtime:
# present but out of date
post = self.UpdateOne(full_name, fn, posted_id)
if post is None:
# Actually it wasn't present after all, presumably
# deleted by owner.
post = self.UploadOne(full_name, fn, title, body)
else:
self.uploaded.append(fn)
print "Already up to date: ", fn
if post is None:
# didn't do anything
return False
else:
# uploaded or updated so update the record
self.post_ids[fn] = (post.get_post_id(), post.FindAlternateLink(), mtime)
self.SavePostIds()
return True
def UploadOne(self, full_name, fn, title, body):
"""Upload file.
"""
print "Upload ", full_name
post = self.CreatePost(title, body, False)
print "Successfully created public post: \"" + post.title.text + "\".\n"
print "post.__str__", post.__str__
self.uploaded.append(fn)
# Get the post ID. To enable us to update later we need
# this to be associated with the file.
return post
def LoadAndRewrite(self, full_name, fn):
"""
Use tidy to ensure that non-Ascii characters are replaced with
entities. If you don't the Blogger API might choke on
non-UTF8 characters.
It can happen that Tidy will report that the HTML is not
fixable. This usually happens because of faulty <example> or
<verbatim> tags. For instance if one forgets the slash on the
closing tag then emacs-wiki will publish without complaint but
the HTML will be invalid.
"""
#f = open(full_name, "r")
#html = f.read()
if full_name[-5:] != ".html":
# highlight non-html file
cmd = "pygmentize -O full,style=emacs -f html '" + full_name + "'"
status, html = commands.getstatusoutput(cmd)
return fn, html
# For HTML files assume that they come from the wiki so that
# we want to strip the emacs-wiki header.
status, html = commands.getstatusoutput('tidy -quiet "' + full_name + '"')
print "Tidy status: ", status
status = status >> 8
if status == 2:
# Treat Tidy errors as fatal.
print "Tidy reported error in " + full_name
sys.exit(2)
# print "html: " , html
title = re.findall("<title>(.*)</title>", html)
print "full_name: ", full_name
print "title: ", title
body = re.findall(r"<!-- Page published by Emacs Wiki begins here -->(.*)</body>",
html, re.DOTALL)
# print "body: ", body
body = self.RewriteLinks(body[0].strip())
return title[0].strip(), body
def RewriteLinks(self, html):
"""Replace the local hrefs with the addresses recorded in the
post_ids.
"""
#print "ids"
#print "html: ", html
#print self.post_ids.items()
for item in self.post_ids.items():
fn, ids = item
if 2 < len(ids):
# Original did not have url
html = self.RewriteLink(html, fn, ids[1])
return html
def RewriteLink(self, html, filename, replacement):
"""Replace the local hrefs with the addresses recorded in the
post_ids.
Is there a more efficient way? Could create a pattern for
each filename when we load the post_ids. However as the
principal use case is the updating of one or very few files
this won't save much time.
"""
#print "RewriteLink", filename, replacement
search_pattern = r'(<a *href *= *")' + filename + r'(" *>.*</a>)'
#print "ps: ", search_pattern
#pattern = re.compile(search_pattern)
#print "Pattern: ", str(search_pattern)
return re.sub(search_pattern,
r"\1" + replacement + r"\2",
html)
def QueueLinks(self, full_name):
"""
Search the html for links to local files. Add them to the
queue for checking.
"""
html = open(full_name).read()
search_pattern = r'(<a *href *= *")(.*?)(" *>.*?</a>)'
links = re.findall(search_pattern,
html)
print "QueueLinks: ", full_name
for link in links:
href = link[1]
print "href", href
print href.find(".."), href[0:1]
print href.find("..") != -1 or href[0:1] == "/"
if href.find("..") != -1 or href[0:1] == "/" :
# Insist that we do not try to go back up the tree by
# prohibiting dot-dot. Finally, prohibit absolute
# paths by checking for leading slashes.
# Regard either as fatal.
print "File contains links to local files outside the starting directory"
sys.exit(2)
if href == '':
# Ignore null url
print "Ignoring empty href in: ", link
elif href.find(":") == -1:
# No colon so no protocol which is good enough in this
# application to say that this is a local file once we
# have weeded out the tree climbers and absolutes..
print "Add to Pending: ", href
self.pending.add(href)
else:
print "Ignoring external link: ", href
print "queue: ", self.pending
print "QueueLinks end"
def PrimePending(self):
queue = set()
# Add the relative paths of all the files known to have been
# uploaded already so that they will be checked even if the
# files that refer to them have not changed.
for k, v in self.post_ids.items():
queue.add(k)
# Add the name of the index file.
queue.add(self.wiki_index)
return queue
def GetpostByID(self, post_id):
"""Fetch a post to be updated. See
http://stackoverflow.com/questions/2152112/blogger-python-api-how-do-i-retrieve-a-post-by-post-id,
http://blog.oddbit.com/2010/01/retrieving-blogger-posts-by-post-id.html
"""
print "GetpostByID", post_id
try:
return self.client.get_feed(
self.blog.get_post_link().href + '/%s' % post_id,
auth_token=self.client.auth_token,
desired_class=gdata.blogger.data.BlogPost)
except gdata.client.RequestError, inst:
print "Exception thrown:"
print type(inst) # the exception instance
print inst # __str__ allows args to printed directly
print "dir: ", dir(inst)
return None
except Exception, inst:
print "Failed to get post by id for unexpected reason."
print inst # __str__ allows args to printed directly
print "dir: ", dir(inst)
raise # do not handle
def UpdateOne(self, full_name, fn, post_id):
"""Update a post.
"""
print "Update ", full_name
f = open(full_name, "r")
t = f.read()
post = self.GetpostByID(post_id)
# print "post: ", post
# print "dir: ", dir(post)
# print "link:", post.GetSelfLink()
# print "link:", post.GetPostLink()
# print "link:", post.link
# print "link:", post.get_html_link()
# print "link:", post.FindUrl()
print "url: ", post.FindAlternateLink()
#print "link:", post.find_self_link()
if post is None:
return None
post.text = t
post.AddLabel("wikiblogger")
# category = atom.Category(term='wikiblogger',
# scheme="http://www.blogger.com/atom/ns#")
# post.category.append(category)
self.client.update(post)
self.uploaded.append(fn)
return post
def show_usage(msg):
    """Print an error message plus usage text, then exit with status 2.

    msg may be any object (e.g. a getopt.error); it is stringified.
    Output is byte-identical to the original Python 2 print statements
    (which inserted a separator space after "Error: ") while also being
    valid in Python 3.
    """
    print("Error:  %s" % (msg,))
    print("Usage:")
    print('./wikiblogger.py --src [srcdir] --user [emailaddress] --password [password]')
    sys.exit(2)
def main():
    """Parse command-line options and upload changed wiki pages.

    Requires --src (wiki index file), --user (email address) and
    --password; missing or malformed options exit via show_usage().
    """
    # Process options (thanks to http://code.activestate.com/recipes/576441/)
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["src=", "user=", "password="])
    except getopt.error as msg:
        show_usage(msg)
    user = ''
    pw = ''
    src = ''
    for o, a in opts:
        if o == "--user":
            user = a
        elif o == "--password":
            pw = a
        elif o == "--src":
            src = a
    if user == '' or pw == '' or src == '':
        show_usage('')
    print("src:  %s" % (src,))
    print("user:  %s" % (user,))
    print("pw:  %s" % (pw,))
    wb = WikiBlogger(os.path.expanduser(src), user, pw, True)  # TODO: externalise
    # wb.Upload(10)  # alternative: scan the whole directory instead
    wb.UploadFromPending(10)


if __name__ == '__main__':
    main()
Tuesday, 16 March 2010
nonwiki/wikiblogger/wikiblogger.py
Subscribe to:
Post Comments (Atom)
Blog Archive
-
▼
2010
(286)
-
▼
March
(188)
- Boo Ks
- Latin Dictionary G
- Dot Plot
- Not Ant
- Irrelevant Asides
- Basic Latin
- Latin Notes
- Csharp Layout
- Math Cad
- Stretch Image
- Current Version
- My Software
- Cell Tracking
- Latin Dictionary
- Live Maths Feb 2008
- Code Quality
- Basic Latin
- Basic Latin
- Basic Latin
- Non Programming
- Emacs Wiki
- Latin Dictionary M
- Latin Index
- Hot Mail Downloader
- Grasp De Bello
- Throwing Exceptions
- Visua Basic Profiler
- Latin Dictionary D
- Rag Bag
- Basic To Ctranslator
- Missing Pages
- Hard Little Words
- Wiki Posterous
- Open Office
- Other Links
- Live Maths
- Wiki Blogger
- Live Maths Two
- home
- Stem Notes
- Live Maths Code Generation
- Learning Methods
- Peer To Peer Networks
- Auto Menu Old
- Peer To Peer Networks
- Tiny Cad
- Using Bluej
- WikiIndex
- WikiIndex
- WikiIndex
- Curl
- Big body
- Curl
- Big body
- Curl test
- test..etc
- test..etc
- test..etc
- test..etc
- test..etc
- Euclid's Elements Book I
- Latin Dictionary P
- Latin Dictionary O
- Latin Participles
- Latin Adler
- Complete Idiots Guide Latin
- Latin Glossary
- Jamendo torrents and alternative trackers
- WikiIndex
- Latin Dictionary S
- Latin Dictionary I
- VB.Net attributes, useful pages
- Latin Dictionary M
- Can I attach binaries?
- Latin Dictionary L
- Live Maths Jan 2008
- Grasp Custodes
- Atheism
- Dot Plot Version Three
- nonwiki/wikiblogger/wikiblogger.py
- Proposed Programs
- My Software
- Sharp Threads
- Csharp Notes
- Soft Ware
- Latin Dictionary B
- Latin Dictionary a
- Par T 1
- Latin Dictionary Q
- Regular Expressions
- Latin Dictionary E
- Latin Dictionary C
- Picture Box
- Gnu Plot
- Latin Dictionary G
- Csharp Layout
- Current Version
- Latin Dictionary D
- Csharp Links
- Unix Text
-
▼
March
(188)
No comments:
Post a Comment