34 lines
1019 B
Python
34 lines
1019 B
Python
# Generates urlencoded.txt from utf-8.txt
|
|
#
|
|
# urlencoded.txt is used by Tests_Formatting_Utf8UriEncode
|
|
|
|
import urllib, codecs, re
|
|
import sys
|
|
|
|
# Matches one percent-escape as emitted by urllib.quote: a "%" followed by two
# uppercase hex digits. Group 1 is the octet, which fix() rewrites; all other
# text in the line is left alone.
# Only hex digits (0-9, A-F) are accepted so that a stray pair such as "%ZZ"
# is never handed to int(octet, 16) in fix().
capfix = re.compile(r"%([0-9A-F]{2})")
|
|
def fix(match):
    """Return the replacement text for one percent-escape match.

    An octet below 0x80 (ASCII) is decoded back to its literal character, so
    ASCII text that was escaped comes out exactly as it went in.  Any other
    octet stays percent-encoded, with its hex digits lowercased.

    match -- regex match object whose group 1 holds the two hex digits.
    """
    octet = match.group(1)
    intval = int(octet, 16)
    if intval < 128:
        # Restore the character verbatim: lowercasing it here would turn an
        # escaped uppercase letter such as %41 into "a" instead of "A".
        return chr(intval)
    return '%' + octet.lower()
|
|
|
|
def urlencode(line):
    """Percent-encode each byte of non-ASCII unicode characters.

    The line is stripped of surrounding whitespace, encoded as UTF-8 and
    percent-quoted; the escapes are then post-processed with ``capfix`` and
    ``fix``.

    line -- one line of unicode text.
    Returns the encoded line as a string.
    """
    # urllib.quote exists only on Python 2; Python 3 moved it to
    # urllib.parse.quote.  Look it up at call time so the same source runs on
    # both without changing Python 2 behaviour.
    quote = getattr(urllib, "quote", None)
    if quote is None:
        from urllib.parse import quote
    quoted = quote(line.strip().encode("utf-8"))
    return capfix.sub(fix, quoted)
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read UTF-8 lines from stdin and write the
    # urlencode()d form of each line to stdout as ASCII.
    args = sys.argv[1:]
    if args and args[0] in ("-h", "--help"):
        # Written with sys.stdout.write instead of the Python 2 print
        # statement so the file also parses under Python 3; the text and the
        # trailing newline are the same.
        sys.stdout.write("Usage: python urlencode.py < utf-8.txt > urlencoded.txt\n")
        sys.exit(2)

    # The codecs wrappers need byte streams.  Python 3 exposes them as
    # ``.buffer`` on the text-mode standard streams; Python 2 streams have no
    # such attribute and are wrapped directly, as before.
    sys.stdin = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin))
    sys.stdout = codecs.getwriter("ascii")(getattr(sys.stdout, "buffer", sys.stdout))

    lines = sys.stdin.readlines()
    # Encoded lines are joined with "\n"; nothing is written after the last one.
    sys.stdout.write("\n".join(map(urlencode, lines)))
|