Code:
#!/usr/bin/env python2
# This script is Python 2 (print statement, unichr, str.decode), so the
# interpreter line must name Python rather than perl.
#
# CGI response header. The literal "\n\n" plus the newline that print adds
# terminates the header section before any body output is written.
print("Content-Type: text/plain\n\n")
#===========================
#Besmellah Alrahman Alraheem
#===========================
#
################################################
# #
# Flaraby 2 #
# ----------- #
# Convert Unicode string/file #
# to Unicode Arabic string readable #
# in Flash dynamic TextField #
# ------------------------------------------ #
# Copyright 2006 newhive All Rights Reserved #
# http://www.newhive.com #
# Author: Ahmad Adel ElDardiry #
# ahmad@newhive.com #
# #
################################################
import cgi
import codecs
import re
#=========================================================
# Data
#=========================================================
# Characters the shaper knows how to convert. Entry i of this list corresponds
# to row i of arOrd below (convertArabicChar looks a character up with
# ar.index() and uses the result to index arOrd).
ar=[u'\u0627',u'\u0623',u'\u0625',u'\u0622',u'\u0628',u'\u062A',u'\u0629',u'\u062B',u'\u062C',u'\u062D',u'\u062E',u'\u062F',u'\u0630',u'\u0631',u'\u0632',u'\u0633',u'\u0634',u'\u0635',u'\u0636',u'\u0637',u'\u0638',u'\u0639',u'\u063A',u'\u0641',u'\u0642',u'\u0643',u'\u0644',u'\u0645',u'\u0646',u'\u0647',u'\u0648',u'\u0624',u'\u064A',u'\u0649',u'\u0626',u'\u0621',u'\u0640',u'\u064B',u'\u064C',u'\u064D',u'\u064E',u'\u064F',u'\u0650',u'\u0651',u'\u0652',u'\uFEFB',u'\uFEF9',u'\uFEF7',u'\uFEF5']
# Output code points for every character in `ar`, one row per character.
# Column order, as used by convertArabicChar:
#   [0] isolated, [1] final, [2] initial, [3] medial
# Rows that repeat a single value emit that code point in every position.
arOrd = [
[65165,65166,65165,65166],
[65155,65156,65155,65156],
[65159,65160,65159,65160],
[65153,65154,65153,65154],
[65167,65168,65169,65170],
[65173,65174,65175,65176],
[65171,65172,65171,65172],
[65177,65178,65179,65180],
[65181,65182,65183,65184],
[65185,65186,65187,65188],
[65189,65190,65191,65192],
[65193,65194,65193,65194],
[65195,65196,65195,65196],
[65197,65198,65197,65198],
[65199,65200,65199,65200],
[65201,65202,65203,65204],
[65205,65206,65207,65208],
[65209,65210,65211,65212],
[65213,65214,65215,65216],
[65217,65218,65219,65220],
[65221,65222,65223,65224],
[65225,65226,65227,65228],
[65229,65230,65231,65232],
[65233,65234,65235,65236],
[65237,65238,65239,65240],
[65241,65242,65243,65244],
[65245,65246,65247,65248],
[65249,65250,65251,65252],
[65253,65254,65255,65256],
[65257,65258,65259,65260],
[65261,65262,65261,65262],
[65157,65158,65157,65158],
[65265,65266,65267,65268],
[65263,65264,65263,65264],
[65161,65162,65163,65164],
[65152,65152,65152,65152],
[1600,1600,1600,1600],
[1611,1611,1611,1611],
[1612,1612,1612,1612],
[1613,1613,1613,1613],
[1614,1614,1614,1614],
[1615,1615,1615,1615],
[1616,1616,1616,1616],
[1617,1617,1617,1617],
[1618,1618,1618,1618],
[65275,65276,65275,65276],
[65273,65274,65273,65274],
[65271,65272,65271,65272],
[65269,65270,65269,65270],
]
# Latin letters A-Z then a-z, and their code points in the same order.
# Neither list is referenced by the functions shown in this file.
en = [u'\u0041',u'\u0042',u'\u0043',u'\u0044',u'\u0045',u'\u0046',u'\u0047',u'\u0048',u'\u0049',u'\u004A',u'\u004B',u'\u004C',u'\u004D',u'\u004E',u'\u004F',u'\u0050',u'\u0051',u'\u0052',u'\u0053',u'\u0054',u'\u0055',u'\u0056',u'\u0057',u'\u0058',u'\u0059',u'\u005A',u'\u0061',u'\u0062',u'\u0063',u'\u0064',u'\u0065',u'\u0066',u'\u0067',u'\u0068',u'\u0069',u'\u006A',u'\u006B',u'\u006C',u'\u006D',u'\u006E',u'\u006F',u'\u0070',u'\u0071',u'\u0072',u'\u0073',u'\u0074',u'\u0075',u'\u0076',u'\u0077',u'\u0078',u'\u0079',u'\u007A']
enOrd = [65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122]
# Punctuation characters and a parallel list of code points. Neither list is
# referenced by the functions shown in this file.
# NOTE(review): the two lists do not line up everywhere. U+061F / U+061B
# (entries 1 and 2) and the [ ] and { } pairs appear in the opposite order in
# puncArOrd. The bracket swap may be deliberate mirroring for right-to-left
# output; the U+061F / U+061B swap looks accidental -- confirm before using.
puncAr = [u'\u060C',u'\u061F',u'\u061B',u'\n',u'\u002E',u'\u002C',u'\u0028',u'\u0029',u'\u0022',u'\u0027',u'\u005B',u'\u005D',u'\u007B',u'\u007D',u'\u003F',u'\u003A',u'\u002F',u'\u002B',u'\u002D',u'\u003D',u'\u005F',u'\u003E',u'\u003C',u'\u0021',u'\u0040',u'\u0023',u'\u0024',u'\u0025',u'\u005E',u'\u0026',u'\u002A',u'\u007C',u'\u00BB',u'\u00AB',u'\u007E',u'\u00A9',u'\u00AE',u'\u00B0']
puncArOrd = [1548,1563,1567,10,46,44,40,41,34,39,93,91,125,123,63,58,47,43,45,61,95,62,60,33,64,35,36,37,94,38,42,124,187,171,126,169,174,176]
# ASCII digits and the code points they are replaced with (1632..1641, i.e.
# U+0660..U+0669); used through getOrd() by analyzeWord().
digits = [u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9']
digitsOrd = [1632,1633,1634,1635,1636,1637,1638,1639,1640,1641]
# Characters after which convertArabicChar shapes the following letter as
# isolated/initial instead of final/medial.
sep = [u'\u0627',u'\u0623',u'\u0625',u'\u0622',u'\u0631',u'\u0632',u'\u0648',u'\u0624',u'\u062F',u'\u0630',u'\u0621',u'\uFEFB',u'\uFEF9',u'\uFEF7',u'\uFEF5']
# Diacritic marks (U+064B..U+0652) that convertArabicChar looks past when it
# inspects the neighbours of a letter.
tashkeel = [u'\u064B',u'\u064C',u'\u064D',u'\u064E',u'\u064F',u'\u0650',u'\u0651',u'\u0652']
# Markup recognised by replaceTags(). Each value is a three-item list:
#   [0] 1 when the pattern has a single group (a lone tag), 2 when it has
#       three groups (opening tag, inner text, closing tag) -- see repFunc()
#   [1] the regular expression source
#   [2] the tag strings captured so far (appended to by repFunc(), read back
#       by retFunc())
tagsArr = {
'img':[1,'(<img [^>]*>)',[]],
'lnk':[2,'(<a [^>]*>)(.*?)(<\/a>)',[]],
'clr':[2,'(<font [^>]*>)(.*?)(<\/font>)',[]],
'udr':[2,'(<u>)(.*?)(<\/u>)',[]]
}
# Scratch state shared between replaceTags() and its callback repFunc():
# the tagsArr key currently being substituted and the next placeholder number.
currentTag = u''
matchIndex = 0
#=========================================================
# Functions
#=========================================================
def init():
    """Read the CGI request and return the converted text.

    Form fields read from the request:
      type   -- 'text' selects the plain-text path of processStr(); any other
                value (including a missing one) selects the markup path.
      str    -- UTF-8 encoded text to convert, or a file path when isFile
                is 'true'.
      isFile -- 'true' to load the text from the file named by `str`.

    cgi.FieldStorage drops blank values by default, so an empty `str` or an
    omitted `isFile` used to raise KeyError. getfirst() with a default treats
    them as an empty string / 'false' instead, and also copes with a field
    that was submitted more than once.
    """
    inputVars = cgi.FieldStorage()
    typ = inputVars.getfirst('type', '')
    inputStr = inputVars.getfirst('str', '').decode('utf-8')
    if inputVars.getfirst('isFile', 'false') == 'true':
        # NOTE(security): the path comes straight from the request, so any
        # file readable by the CGI process can be requested. Restrict it to a
        # known directory if this script is exposed to untrusted clients.
        inputStr = readFile(inputStr)
    return processStr(inputStr, typ)
def readFile(file):
    """Return the decoded contents of the UTF-8 text file at path `file`.

    A leading byte-order mark (U+FEFF) is removed when the file starts with
    one. The first character used to be dropped unconditionally, which ate a
    real character from files saved without a BOM.

    Raises IOError/OSError if the file cannot be opened and UnicodeDecodeError
    if it is not valid UTF-8.
    """
    f = codecs.open(file, 'r', 'utf-8')
    try:
        text = f.read()
    finally:
        # Close the handle even when read() fails while decoding.
        f.close()
    if text[:1] == u'\ufeff':
        text = text[1:]
    return text
def processStr(str, typ):
    """Run the full conversion pipeline over `str` and return the result.

    `typ` equal to 'text' takes the plain-text path: newlines and lam-alef
    pairs are normalised and the words are shaped. Any other value takes the
    markup path, which additionally swaps tags for placeholders before
    shaping and puts them back afterwards.
    """
    prepared = replaceDoubleChars(replaceNewlines(str))
    if typ == 'text':
        return convertArabicStr(prepared)

    # Markup path: make <br> a token of its own, hide the tags behind
    # placeholders, shape the words, then restore the tags.
    marked = prepared.replace(u'<br>', u' <br> ')
    marked = joinBetweenTags(replaceTags(marked))
    shaped = splitBetweenTags(returnTags(convertArabicStr(marked)))

    # Collapse the word separators left over around restored tags.
    shaped = shaped.replace(u'>;;;<', u'><')
    return shaped.replace(u';;;;;;', u';;;')
def replaceNewlines(str):
    """Turn every CR LF pair and every remaining CR into ' \\n '.

    The surrounding spaces make the newline a separate token once the text is
    split on spaces. A bare LF is left as it is.
    """
    # CR LF must be handled before a lone CR so a pair yields one newline.
    for ending in (u'\r\n', u'\r'):
        str = str.replace(ending, u' \n ')
    return str
def replaceDoubleChars(str):
    """Replace each lam + alef pair with its single ligature character.

    Four pairs are handled: lam followed by U+0627, U+0625, U+0623 or U+0622,
    mapped to U+FEFB, U+FEF9, U+FEF7 and U+FEF5 respectively.
    """
    ligatures = (
        (u'\u0644\u0627', u'\uFEFB'),
        (u'\u0644\u0625', u'\uFEF9'),
        (u'\u0644\u0623', u'\uFEF7'),
        (u'\u0644\u0622', u'\uFEF5'),
    )
    for pair, ligature in ligatures:
        str = str.replace(pair, ligature)
    return str
#---------------------------------------------
def replaceTags(str):
    """Replace every tag listed in tagsArr with a numbered placeholder.

    For each entry of tagsArr the module-level `currentTag` and `matchIndex`
    are set up and the entry's pattern is substituted through repFunc(),
    which records the matched tag text in the entry's capture list.

    Returns the text with tags replaced by ' |^|<name>-<n>-S|^| ' (and
    '-E' for closing tags) placeholders.
    """
    global currentTag, matchIndex
    for tag in tagsArr:
        currentTag = tag
        matchIndex = 0
        # Placeholder numbers restart at 0 for every tag, so the captures of
        # an earlier call must be dropped as well; otherwise a second call in
        # the same process makes retFunc() resolve placeholders to stale tags.
        del tagsArr[tag][2][:]
        regx = re.compile(tagsArr[tag][1], re.UNICODE)
        str = regx.sub(repFunc, str)
    return str
def repFunc(matchObj):
    """Substitution callback for replaceTags().

    Records the matched tag text in the capture list of the current tagsArr
    entry and returns the placeholder text that stands in for it. Each
    placeholder consumes one value of the module-level `matchIndex`.
    """
    global matchIndex
    name = currentTag
    entry = tagsArr[name]

    if entry[0] == 1:
        # Lone tag: a single start ("S") placeholder.
        entry[2].append(matchObj.group(1))
        marker = u' |^|' + name + '-' + ('%s' % matchIndex) + '-S|^| '
        matchIndex = matchIndex + 1
        return marker

    # Paired tag: the closing tag is stored before the opening tag, so the
    # "S" placeholder number points at the closing tag and the "E" number at
    # the opening tag.
    entry[2].append(matchObj.group(3))
    entry[2].append(matchObj.group(1))

    opening = u' |^|' + name + '-' + ('%s' % matchIndex) + '-S|^| '
    matchIndex = matchIndex + 1
    closing = u' |^|' + name + '-' + (u'%s' % matchIndex) + '-E|^| '
    matchIndex = matchIndex + 1

    # The text between the tags stays in place between the two placeholders.
    return opening + matchObj.group(2) + closing
def joinBetweenTags(str):
    """Mark the text enclosed by a start/end placeholder pair.

    The pattern matches a start placeholder, spaces, a run of text free of
    '|', '^', '<' and '>', spaces, and an end placeholder carrying the same
    tag name (back-reference to group 2). Each match is rewritten by
    repBetweenFunc().
    """
    matcher = re.compile(
        '(\|\^\|([\w]+)-[\d]+-S\|\^\|[ ]+)([^\|\^<>]+)([ ]+\|\^\|(\\2)-[\d]+-E\|\^\|)',
        re.UNICODE)
    return matcher.sub(repBetweenFunc, str)
def repBetweenFunc(matchObj):
    """Substitution callback for joinBetweenTags().

    Returns group 1, then group 3 with every space turned into '_' and
    wrapped in '|||||' markers, then group 4.
    """
    opening, inner, closing = matchObj.group(1, 3, 4)
    # Underscores keep the enclosed words together as one space-free token.
    return opening + u'|||||' + inner.replace(u' ', u'_') + u'|||||' + closing
def returnTags(str):
    """Swap every ';;;'-delimited tag placeholder back for its stored tag.

    Placeholders have the form ';;;|^|<name>-<number>-<S or E>|^|;;;'; each
    match is resolved by retFunc().
    """
    placeholder = re.compile(
        ';;;(\|\^\|)([\w]+)-([\d]+)-([SE])(\|\^\|);;;',
        re.UNICODE)
    return placeholder.sub(retFunc, str)
def retFunc(matchObj):
    """Substitution callback for returnTags().

    Group 2 of the match is the tagsArr key and group 3 the position in that
    entry's capture list. Returns the stored tag text wrapped in ';;;'.

    Text that merely looks like a placeholder (unknown tag name or a number
    with no stored tag, e.g. typed by the user) is returned unchanged instead
    of raising KeyError/IndexError.

    NOTE(review): repFunc() stores a closing tag before its opening tag, so
    an 'S' placeholder resolves to the closing tag and an 'E' placeholder to
    the opening one -- presumably because the output is laid out
    right-to-left; confirm.
    """
    tag = matchObj.group(2)
    index = int(matchObj.group(3))
    if tag not in tagsArr or index >= len(tagsArr[tag][2]):
        return matchObj.group(0)
    return u';;;' + tagsArr[tag][2][index] + u';;;'
def splitBetweenTags(str):
    """Process the spans that joinBetweenTags() wrapped in '|||||' markers.

    After splitting on the marker, the odd-numbered pieces are the wrapped
    spans; each goes through handleBTW(). The pieces are then glued back
    together without the markers.
    """
    pieces = str.split(u'|||||')
    for pos in range(1, len(pieces), 2):
        pieces[pos] = handleBTW(pieces[pos])
    return u''.join(pieces)
def handleBTW(str):
    """Convert one '_'-joined span of words found between two tags.

    The span is split on '_'. Words for which isArabic() is true are passed
    through convertArabicStr(); the others are kept verbatim. The words are
    returned in their original order, joined with ';;;'.
    """
    converted = []
    for chunk in str.split(u'_'):
        if isArabic(chunk):
            chunk = convertArabicStr(chunk)
        converted.append(chunk)
    return u';;;'.join(converted)
def isArabic(word):
    """Return True if any character of `word` has a code point above 255."""
    for ch in word:
        if ord(ch) > 255:
            return True
    return False
#---------------------------------------------
# Whole words that are emitted as fixed output sequences rather than being
# shaped letter by letter. Every sequence starts with U+FDF2; the remaining
# code points are the ones the original if/elif chain produced.
_ALLAH_FORMS = {
    u'\u0627\u0644\u0644\u0647': u'\ufdf2\ufe8d',              # Allah
    u'\u0648\u0627\u0644\u0644\u0647': u'\ufdf2\ufe8d\ufeed',  # WAllah
    u'\u0641\u0627\u0644\u0644\u0647': u'\ufdf2\ufe8d\ufed3',  # FAllah
    u'\u0628\u0627\u0644\u0644\u0647': u'\ufdf2\ufe8d\ufe91',  # Bellah
    # The original literal read u'\u062a8...' (stray "8"), so this word could
    # never match.
    u'\u062a\u0627\u0644\u0644\u0647': u'\ufdf2\ufe8d\ufe97',  # TAllah
    u'\u0644\u0644\u0647': u'\ufdf2',                          # Lellah
}


def convertArabicStr(str):
    """Shape every space-separated word of `str`.

    Words listed in _ALLAH_FORMS are replaced by their fixed sequence.
    Placeholder tokens (starting with '|^|' or '|||||') and the literal
    '<br>' are passed through untouched. Every other word goes through
    analyzeWord().

    Returns the converted words joined with ';;;' instead of a space, so that
    later steps do not split restored tags on spaces.
    """
    shaped = []
    for word in str.split(u' '):
        if word in _ALLAH_FORMS:
            shaped.append(_ALLAH_FORMS[word])
        elif (word[0:3] == u'|^|' or word[0:5] == u'|||||'
              or word == u'<br>'):
            shaped.append(word)
        else:
            shaped.append(analyzeWord(word))
    return u';;;'.join(shaped)
def analyzeWord(word):
    """Convert a single space-free token according to what it contains.

    - Only characters from `ar` (this includes the empty token): shaped as
      one word by convertArabicWord().
    - Some characters from `ar` mixed with others: the token is cut into runs
      by getMixedWordTypes(); 'arabic' runs are shaped, 'digits' runs are
      mapped through getOrd(), other runs are kept, and the runs are emitted
      in reverse order.
    - No character from `ar` and word.isdigit() is true: mapped through
      getOrd() with the digits tables.
    - Anything else: returned unchanged.
    """
    known = [ch in ar for ch in word]

    if False not in known:
        # Pure Arabic
        return convertArabicWord(word)

    if True in known:
        # Mix of Arabic characters with Latin letters, punctuation or digits
        layout = getMixedWordTypes(word)
        starts = layout['ids']
        kinds = layout['types']
        # Each run ends where the next one starts; the last ends with the word.
        stops = starts[1:] + [len(word)]
        runs = []
        for begin, stop, kind in zip(starts, stops, kinds):
            run = word[begin:stop]
            if kind == 'arabic':
                run = convertArabicWord(run)
            elif kind == 'digits':
                run = getOrd(run, digits, digitsOrd)
            runs.append(run)
        runs.reverse()
        return u''.join(runs)

    if word.isdigit():
        # Pure number (digits are replaced through the digits tables)
        return getOrd(word, digits, digitsOrd)

    # Latin letters, punctuation, digits mixed together: nothing to convert
    return word
def getOrd(word, arr1, arr2):
    """Map each character of `word` from `arr1` to the code point in `arr2`.

    `arr1` is a sequence of characters and `arr2` the parallel sequence of
    integer code points. When a character occurs more than once in `arr1`,
    its first position wins.

    Characters that are not listed in `arr1` are copied through unchanged.
    They used to raise ValueError, which analyzeWord() could trigger because
    unicode.isdigit() is also true for digits outside '0'-'9'.

    Returns the mapped unicode string.
    """
    table = {}
    for key, code in zip(arr1, arr2):
        table.setdefault(key, code)

    out = []
    for ch in word:
        if ch in table:
            out.append(unichr(table[ch]))
        else:
            out.append(ch)
    return u''.join(out)
def getMixedWordTypes(word):
    """Cut `word` into runs of same-kind characters.

    Every character is classified as 'arabic' (listed in `ar`), 'digits'
    (listed in `digits`) or 'strange' (anything else). A new run starts
    whenever the kind changes.

    Returns {'ids': [start offset of each run], 'types': [kind of each run]};
    both lists have one entry per run, in word order.
    """
    ids = []
    types = []
    previous = ''
    for offset, char in enumerate(word):
        if char in ar:
            kind = 'arabic'
        elif char in digits:
            kind = 'digits'
        else:
            kind = 'strange'
        if kind != previous:
            previous = kind
            types.append(kind)
            ids.append(offset)
    return {'ids': ids, 'types': types}
def convertArabicWord(word):
    """Shape every character of `word` and return them in reverse order.

    Each position is converted by convertArabicChar(); the converted
    characters are then emitted last-to-first.
    """
    shaped = [convertArabicChar(pos, word) for pos in range(len(word))]
    shaped.reverse()
    return u''.join(shaped)
def _followsSeparator(p, word):
    """Return True when the letter before position `p` is listed in `sep`.

    At most two tashkeel marks directly before `p` are skipped to reach that
    letter. Positions before the start of the word never count, so no
    negative index can wrap around to the end of the word.
    """
    q = p - 1
    skipped = 0
    while q >= 0 and skipped < 2 and word[q] in tashkeel:
        q = q - 1
        skipped = skipped + 1
    return q >= 0 and word[q] in sep


def _onlyTashkeelFollows(p, word):
    """Return True when one or two tashkeel marks are all that follows `p`."""
    rest = word[p + 1:]
    if not 1 <= len(rest) <= 2:
        return False
    for mark in rest:
        if mark not in tashkeel:
            return False
    return True


def convertArabicChar(p, word):
    """Return the output character for word[p], chosen by its position.

    The arOrd row of the character holds four code points in the order
    isolated, final, initial, medial. The column is picked as follows:

    - first character: initial, or isolated for a one-character word;
    - last character, or a character followed only by one or two tashkeel
      marks: isolated when it follows a `sep` letter, otherwise final;
    - any other character: initial when it follows a `sep` letter, otherwise
      medial.

    "Follows a `sep` letter" looks past up to two tashkeel marks. The previous
    code tested word[p-2]/word[p-3] without checking that word[p-1] was a
    tashkeel mark, so in waw + fatha + lam + dal the dal was given the
    isolated form although it follows a joining lam; the last-character
    branches also let negative indices wrap around.

    Raises ValueError if word[p] is not listed in `ar`.
    """
    ISOLATED, FINAL, INITIAL, MEDIAL = 0, 1, 2, 3

    forms = arOrd[ar.index(word[p])]
    lastIndex = len(word) - 1

    if p == 0:
        if lastIndex > 0:
            form = INITIAL
        else:
            form = ISOLATED
    elif p == lastIndex or _onlyTashkeelFollows(p, word):
        # Nothing but diacritics comes after this letter: it ends the word.
        if _followsSeparator(p, word):
            form = ISOLATED
        else:
            form = FINAL
    elif _followsSeparator(p, word):
        # The previous letter does not join forward, so this one starts anew.
        form = INITIAL
    else:
        form = MEDIAL
    return unichr(forms[form])
#=========================================================
# Main
#=========================================================
# Convert the request, wrap the result as "&outputStr=<text>&" and write it
# to the response body as UTF-8.
outputStr = init()
outputVars = ("&outputStr=" + outputStr + "&").encode('utf-8')
print(outputVars)
#=========================================================
#----------------------------
#Alhamdo Lellah Rabelalameen
Social Networking Bookmarks