Projects
I don't have much spare time for projects lately, but if I do build anything I'll be sure to put it up here.
Redline Routes
This is a website I've created for user-submitted ideal motorcycle routes. It started as just messing around with the Google Maps API, then turned into the site because I couldn't find a decent site with Canadian roads, only US roads.
Bulk HTML Tidy and XHTML 1.0 Strict compliance
bulkTidy.py - Click for source code
# Usage: from command line, run following command - dirWalker.py /path/to/scan/
# The path to scan is optional. If no path is given, current working directory will be used.
# A path to the batch file must be given. This creates a batch tidy script for all htm files in the directory/directories given
# This will automatically run the batch script and some regular expressions to clean the documents.
from __future__ import generators # needed for Python 2.2
from datetime import datetime # Not really necessary, but used for script timer (for benchmarking)
import sys, os, stat, re, thread
#For files that cannot be opened
cantOpen = 0 #global variable
def walktree(top=".", depthfirst=True):
    """Generate ``(directory, entries)`` pairs for every directory under top.

    Credit to Noah Spurrier and Doug Fort.

    Parameters:
        top        -- directory at which the walk starts.
        depthfirst -- when true, a directory is yielded after all of its
                      sub-directories; when false, it is yielded before them.

    Yields:
        Tuples of (directory path, list of entry names as returned by
        ``os.listdir``).  Sub-directories are detected with ``os.lstat``, so
        an entry is only descended into when lstat itself reports a
        directory.  Entries that cannot be stat'ed are silently skipped.
    """
    import os
    import stat

    entries = os.listdir(top)

    if not depthfirst:
        yield top, entries

    for entry in entries:
        child = os.path.join(top, entry)
        try:
            mode = os.lstat(child).st_mode
        except os.error:
            # Could not stat this entry; ignore it and carry on.
            continue
        if not stat.S_ISDIR(mode):
            continue
        for pair in walktree(child, depthfirst):
            yield pair

    if depthfirst:
        yield top, entries
#/////////////////////////////////////////////////
# Start A Real Validator creation
#/////////////////////////////////////////////////
def findArvFiles(top, depthfirst=False):
    """Build the file list consumed by A Real Validator.

    Walks the tree under top with ``walktree`` and collects every file whose
    name ends in ``.htm`` or ``.HTM``.  Directories whose path contains
    ``_baks``, ``_vti``, ``_notes`` or ``_cnf`` are skipped.

    Parameters:
        top        -- directory to scan; either a Cygwin
                      ``/cygdrive/<drive>/...`` path or a relative path.
        depthfirst -- accepted for signature compatibility only.  It is not
                      forwarded, so ``walktree`` runs with its own default
                      ordering.

    Returns:
        A string that begins with a newline and then holds one
        Windows-style path per line, each line newline-terminated.
    """
    skipMarkers = ('_baks', '_vti', '_notes', '_cnf')

    def toWindowsPath(path):
        # Only a genuine /cygdrive/<drive>/... path carries a drive letter.
        # Blindly turning the first "/" into ":\" mangled relative paths
        # (e.g. "./sub" became ".:\sub"), which is what the default scan
        # root "." produces.
        prefix = "/cygdrive/"
        if path.startswith(prefix):
            remainder = path[len(prefix):]
            if "/" in remainder:
                path = remainder.replace("/", ":\\", 1)
            else:
                # Bare drive root such as /cygdrive/c
                path = remainder + ":\\"
        return path.replace("/", "\\")

    ret = ['\n']
    for dirPath, names in walktree(top):
        if [marker for marker in skipMarkers if marker in dirPath]:
            continue
        # Convert once per directory rather than once per file.
        windowsDir = toWindowsPath(dirPath)
        if not windowsDir.endswith('\\'):
            windowsDir = windowsDir + "\\"
        for name in names:
            if name.endswith('.htm') or name.endswith('.HTM'):
                ret.append(windowsDir + name + "\n")
    return ''.join(ret)  # Much faster than += method
def makeArvFile(top, depthfirst=False):
    """Write the A Real Validator file list to /cygdrive/c/toClean.arv.

    Parameters:
        top        -- directory to scan, handed to ``findArvFiles``.
        depthfirst -- handed to ``findArvFiles`` unchanged.

    Raises:
        SystemExit -- when the script was not started with one or two
                      command-line arguments (``len(sys.argv)`` of 2 or 3).

    Side effects:
        Creates or overwrites the ARV file, sets its mode to rwx for user,
        group and other, and prints two progress messages.
    """
    if len(sys.argv) < 2 or len(sys.argv) > 3:  # the program name and the one arguments
        # stop the program and print an error message
        sys.exit("Invalid number of options.\nEx: dirWalker.py /path/to/scan/(optional) /path/to/arv/file/\nIf no path to scan folder is specified, current path is used")
    filePath = '/cygdrive/c/'
    arvPath = filePath + 'toClean.arv'
    # Collect the listing before touching the output file, so a failure
    # during the walk does not leave a truncated file behind.
    contents = findArvFiles(top, depthfirst)
    openFile = open(arvPath, 'w+')
    try:
        openFile.write(contents)
    finally:
        # Always release the handle, even when the write fails.
        openFile.close()
    os.chmod(arvPath, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU)
    print("File permissions on ARV file changed successfully\n")
    print("A Real Validator file successfully created at " + filePath + "toClean.arv\n")
#/////////////////////////////////////////////////
# End A Real Validator creation
#/////////////////////////////////////////////////
#/////////////////////////////////////////////////
# Start HTML file creation (for QA purposes)
#/////////////////////////////////////////////////
def findHtmlFiles(top, depthfirst=False):
    """Build the body of the QA page: one hyperlink per .htm file found.

    Walks the tree under top with ``walktree`` and, for every file whose
    name ends in ``.htm`` or ``.HTM``, emits an ``<a>`` element followed by
    ``<br />``.  Directories whose path contains ``_baks``, ``_vti``,
    ``_notes`` or ``_cnf`` are skipped.  Paths containing ``/cygdrive/z/``
    are rewritten to ``http://clf2dev/``; otherwise paths containing
    ``/cygdrive/e/`` are rewritten to ``http://preweb/``.

    Parameters:
        top        -- directory to scan.
        depthfirst -- accepted for signature compatibility only; it is not
                      forwarded to ``walktree``.

    Returns:
        A string that begins with a newline, followed by one link line per
        matching file.
    """
    from xml.sax.saxutils import escape  # To quote out things like &

    skipMarkers = ('_baks', '_vti', '_notes', '_cnf')
    ret = ['\n']
    for dirPath, names in walktree(top):
        if [marker for marker in skipMarkers if marker in dirPath]:
            continue
        if dirPath.endswith('/'):
            base = dirPath
        else:
            base = dirPath + "/"
        # File names never contain "/", so the drive-to-host mapping can be
        # decided once per directory instead of once per file.
        if base.find("/cygdrive/z/") >= 0:
            base = base.replace("/cygdrive/z/", "http://clf2dev/")
        elif base.find("/cygdrive/e/") >= 0:
            base = base.replace("/cygdrive/e/", "http://preweb/")
        for name in names:
            if name.endswith('.htm') or name.endswith('.HTM'):
                html = base + name
                # Escape markup characters so names containing &, < or >
                # still give well-formed output; the attribute value also
                # needs its double quotes escaped.
                href = escape(html, {'"': '&quot;'})
                ret.append("<a href=\"" + href + "\">" + escape(html) + "</a><br />\n")
    return ''.join(ret)  # Much faster than += method
def makeHtmlFile(top, depthfirst=False):
    """Write the QA link page to /cygdrive/c/toQA.html.

    Parameters:
        top        -- directory to scan, handed to ``findHtmlFiles``.
        depthfirst -- handed to ``findHtmlFiles`` unchanged.

    Raises:
        SystemExit -- when the script was not started with one or two
                      command-line arguments (``len(sys.argv)`` of 2 or 3).

    Side effects:
        Creates or overwrites the HTML file, sets its mode to rwx for user,
        group and other, and prints two progress messages.
    """
    if len(sys.argv) < 2 or len(sys.argv) > 3:  # the program name and the one arguments
        # stop the program and print an error message
        sys.exit("Invalid number of options.\nEx: dirWalker.py /path/to/scan/(optional) /path/to/arv/file/\nIf no path to scan folder is specified, current path is used")
    filePath = '/cygdrive/c/'
    htmlPath = filePath + 'toQA.html'
    # Build the links before opening the output file, so a failure during
    # the walk does not leave a truncated file behind.
    contents = findHtmlFiles(top, depthfirst)
    openFile = open(htmlPath, 'w+')
    try:
        openFile.write(contents)
    finally:
        # Always release the handle, even when the write fails.
        openFile.close()
    os.chmod(htmlPath, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU)
    print("File permissions on HTML file changed successfully\n")
    print("HTML QA file successfully created at " + filePath + "toQA.html\n")
#/////////////////////////////////////////////////
# End HTML file creation
#/////////////////////////////////////////////////
#/////////////////////////////////////////////////
# Start Tidy batch file creation
#/////////////////////////////////////////////////
def searchTidyBatch(top, depthfirst=False):
    """Build the text of a Windows batch file that runs HTML Tidy per file.

    Walks the tree under top with ``walktree`` and emits one ``tidy``
    command line for every file whose name ends in ``.htm`` or ``.HTM``.
    Directories whose path contains ``_baks``, ``_vti``, ``_notes`` or
    ``_cnf`` are skipped.  The file path on each command line is quoted and
    written in Windows form.

    Parameters:
        top        -- directory to scan; either a Cygwin
                      ``/cygdrive/<drive>/...`` path or a relative path.
        depthfirst -- accepted for signature compatibility only; it is not
                      forwarded to ``walktree``.

    Returns:
        A string holding a ``::`` comment header line followed by one tidy
        command per matching file, each newline-terminated.
    """
    tidyString = """tidy -i -w -q -m -asxhtml --quote-marks yes --drop-empty-paras yes --indent yes --indent-spaces 3 --wrap 0 --doctype strict --char-encoding latin1 --logical-emphasis yes --drop-font-tags yes --join-classes no --merge-divs no --word-2000 yes """
    skipMarkers = ('_baks', '_vti', '_notes', '_cnf')

    def toWindowsPath(path):
        # Only a genuine /cygdrive/<drive>/... path carries a drive letter.
        # Blindly turning the first "/" into ":\" mangled relative paths
        # (e.g. "./sub" became ".:\sub"), which is what the default scan
        # root "." produces.
        prefix = "/cygdrive/"
        if path.startswith(prefix):
            remainder = path[len(prefix):]
            if "/" in remainder:
                path = remainder.replace("/", ":\\", 1)
            else:
                # Bare drive root such as /cygdrive/c
                path = remainder + ":\\"
        return path.replace("/", "\\")

    ret = [':: Tidy batch file creation command line tool, written by Andrew Hinde for the CRTC\n']
    for dirPath, names in walktree(top):
        if [marker for marker in skipMarkers if marker in dirPath]:
            continue
        # Convert once per directory rather than once per file.
        windowsDir = toWindowsPath(dirPath)
        if not windowsDir.endswith('\\'):
            windowsDir = windowsDir + "\\"
        for name in names:
            if name.endswith('.htm') or name.endswith('.HTM'):
                ret.append(tidyString + "\"" + windowsDir + name + "\"\n")
    return ''.join(ret)  # Much faster than += method
def writeTidyBatch(top, depthfirst=False):
    """Write the Tidy batch script to /cygdrive/c/toTidy.bat.

    Parameters:
        top        -- directory to scan, handed to ``searchTidyBatch``.
        depthfirst -- handed to ``searchTidyBatch`` unchanged.

    Raises:
        SystemExit -- when more than one command-line argument was given
                      (``len(sys.argv)`` greater than 2).

    Side effects:
        Creates or overwrites the batch file, sets its mode to rwx for user,
        group and other, and prints progress messages along the way.
    """
    if len(sys.argv) < 1 or len(sys.argv) > 2:  # the program name and the one arguments
        # stop the program and print an error message
        sys.exit("Invalid number of options.\nEx: dirWalker.py /path/to/scan/(optional) \nIf no path to scan folder is specified, current path is used")
    filePath = '/cygdrive/c/'
    batchPath = filePath + 'toTidy.bat'
    # Build the commands before opening the output file, so a failure during
    # the walk does not leave a truncated batch file behind.
    batchText = searchTidyBatch(top, depthfirst)
    openFile = open(batchPath, 'w+')
    try:
        print("File created successfully : " + filePath + "toTidy.bat\n")
        openFile.write(batchText)
        print("File written successfully\n")
    finally:
        # Always release the handle, even when the write fails.
        openFile.close()
    os.chmod(batchPath, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU)
    print("File permissions changed successfully\n")
    print("Tidy batch file successfully created.\n")
#/////////////////////////////////////////////////
# End Tidy batch file creation
#/////////////////////////////////////////////////
#/////////////////////////////////////////////////
# Start extra cleaning of documents
#/////////////////////////////////////////////////
def _cleanMarkup(fileContents):
    """Return fileContents with the post-Tidy clean-up rules applied in order."""
    removalPatterns = [
        # width="" and height="" attributes
        r"(?i)(width|height)=\"[\d]+\%?\"",
        # MS if/end if statements
        r"(?i)\<\!(--)?\[(end)?if[^\>]+>",
        # Underline tags
        r"(?i)<[\/]?u>",
        # Bordercolor and bgcolor attributes
        r"(?i)(bgcolor|bordercolor)=\"\#[a-fA-F0-9]{3,6}\"",
        # Ordered list "start" attributes
        r"(?i) start=\"[^\"]+\"",
        # Language="Javascript" attribute on <script> tags
        r"(?i) language=(\")?javascript(\")?",
        # MS tags with a colon in the tag name (namespace-prefixed tags)
        r"(?i)<[\/]?[a-zA-Z0-9]+:[^>]+>",
    ]
    for pattern in removalPatterns:
        fileContents = re.sub(pattern, "", fileContents)

    # Non-regex search and replaces.  The first one restores the dimensions
    # of tphp.gif, which the width/height rule above has just stripped.
    fileContents = fileContents.replace("src=\"/clf20/images/tphp.gif\" alt=\"\"","src=\"/clf20/images/tphp.gif\" width=\"19\" height=\"12\" alt=\"\"")
    fileContents = fileContents.replace(" style=\"margin-left: 2em\""," class=\"indent1\"")

    # Inline style attributes are deliberately left alone: stripping them
    # changed how existing documents render.

    # For all ../ before the clf20 folder (CSS, images, etc)
    fileContents = re.sub(r"(?i)(\.\.\/)+clf20\/", "/clf20/", fileContents)

    # For all XML declarations at the beginning of the page
    XML = "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>"
    return fileContents.replace(XML, "")


def cleanDocs(top, depthfirst=False):
    """Rewrite every .htm file under top with the extra clean-up rules applied.

    Walks the tree under top with ``walktree``.  Each file whose name ends
    in ``.htm`` or ``.HTM`` is read, passed through ``_cleanMarkup`` and
    written back in place.  Directories whose path contains ``_baks``,
    ``_vti``, ``_notes`` or ``_cnf`` are skipped.

    Parameters:
        top        -- directory to scan.
        depthfirst -- accepted for signature compatibility only; it is not
                      forwarded to ``walktree``.

    Side effects:
        Overwrites the matching files, prints one status line per file, and
        increments the module-level ``cantOpen`` counter for each file that
        raises IOError on open.
    """
    global cantOpen
    skipMarkers = ('_baks', '_vti', '_notes', '_cnf')
    for dirPath, names in walktree(top):
        if [marker for marker in skipMarkers if marker in dirPath]:
            continue
        for name in names:
            if not (name.endswith('.htm') or name.endswith('.HTM')):
                continue
            # Open the path exactly as it was walked.  The hand-built
            # Windows form is wrong for relative roots such as the default
            # "." (it produced ".:\sub\file.htm").
            docPath = os.path.join(dirPath, name)
            try:
                # "r+" on purpose: a file that cannot be written later is
                # rejected here, before anything is modified.
                ioFile = open(docPath, "r+")
            except IOError:
                # Usually the case if the file is set at Read Only
                # Future upgrade: chmod the file to allow for writing
                print("Cannot open file!")
                cantOpen += 1
                continue
            except OSError:
                print("File not found!")
                continue
            try:
                fileContents = ioFile.read()
            finally:
                ioFile.close()

            fileContents = _cleanMarkup(fileContents)

            updatedFile = open(docPath, "w+")
            try:
                updatedFile.write(fileContents)
            finally:
                updatedFile.close()
            print("Finished extra cleanup on " + name)
#/////////////////////////////////////////////////
# End extra cleaning of documents
#/////////////////////////////////////////////////
def runTidyBatch():
    """Execute the generated Tidy batch script through the system shell.

    The exit status reported by ``os.system`` is not inspected.
    """
    batchScript = "/cygdrive/c/toTidy.bat"
    os.system(batchScript)
def fileCount(top, depthfirst=False):
    """Count the .htm files under top that the other passes would process.

    Parameters:
        top        -- directory to scan with ``walktree``.
        depthfirst -- accepted for signature compatibility only; it is not
                      forwarded to ``walktree``.

    Returns:
        The number of entries whose name ends in ``.htm`` or ``.HTM``,
        ignoring directories whose path contains ``_baks``, ``_vti``,
        ``_notes`` or ``_cnf``.
    """
    excluded = ('_baks', '_vti', '_notes', '_cnf')
    total = 0
    for folder, entries in walktree(top):
        # Skip backup / FrontPage / Dreamweaver housekeeping folders.
        if [marker for marker in excluded if marker in folder]:
            continue
        matches = [entry for entry in entries
                   if entry.endswith('.htm') or entry.endswith('.HTM')]
        total = total + len(matches)
    return total
if __name__ == '__main__':
    # Script entry point: generate the ARV list, the QA page and the Tidy
    # batch file for the scan root, run Tidy, apply the extra clean-up rules,
    # then report timing.
    import threading

    # The scan root comes from the first command-line argument, but only when
    # exactly one argument was supplied; otherwise the current directory is
    # used.
    if len(sys.argv) == 2:
        top = sys.argv[1]
    else:
        top = '.'
    startTime = datetime.now()

    # Produce the ARV list and the QA page in the background while the main
    # thread builds and runs the Tidy batch.  The handles are kept so the
    # workers can be joined: threads that are never joined may still be
    # writing their files when the interpreter exits.  The former background
    # fileCount call was dropped because its result was never used; the count
    # is computed for the report below.
    workers = []
    for job in (makeArvFile, makeHtmlFile):
        worker = threading.Thread(target=job, args=(top,))
        worker.start()
        workers.append(worker)

    writeTidyBatch(top)
    runTidyBatch()
    os.system("clear")
    cleanDocs(top)

    # Wait for the listing files to be complete before reporting.
    for worker in workers:
        worker.join()

    endTime = datetime.now()
    timeDiff = endTime - startTime
    print("Total time taken : " + str(timeDiff) + " on " + str(fileCount(top)) + " files")
    if cantOpen > 0:
        print(str(cantOpen) + " files could not be opened to run regular expressions on")
    print("\n")
At work, I had the need for an HTML Tidy script that did a little extra, and that could traverse through directories recursively, so I decided to take the opportunity to learn Python as well.
This script takes a given directory (or the current directory if none is given) and lists all .htm files (we don't use the .html extension at work). It then creates a batch file to run HTML Tidy, a file list for checking validation with A Real Validator, and an HTML page of links that makes QA on larger batches easier. Finally, it runs each document through about 10 additional regular-expression and search-and-replace passes to clean up code that Tidy doesn't take care of.
So far, benchmarks show that it can do roughly 5000 pages per hour. Obviously larger pages with more validation issues will cause it to run longer.
This script is very customized towards Government of Canada Common Look and Feel 2, and it was also written using Cygwin Bash Shell for Windows with Python installed.