Manipulating dicts of dicts as trees:
creating a dict from a path
>>> tree_from_path( 'a', 'b', 'c', 1, defaultdict_factory = int ).tprint()
{
a : {
b : {
c : 1,
},
},
}
Convert from any other nested object to a VectorDict. Especially useful for constructing a VectorDict from intricate dicts of dicts. Doesn't work as a class method (why?)
>>> convert_tree({ 'a' : { 'a' : { 'r' : "yop", 'b' : { 'c' : 1 }}}}).tprint()
{
a : {
a : {
r : 'yop',
b : {
c : 1,
},
},
},
}
**BUG: if an empty dict is a leaf, default_factory is not applied.** Workaround: you can specify Krut leaves explicitly. Let's define 3 domains:
- root is defaulting to str
- a is defaulting to list
- b defaulting to int
>>> from vector_dict.VectorDict import VectorDict as krut
>>> from vector_dict.VectorDict import converter as kruter
>>> a = kruter( { 'a' : Krut( list, {} ),'b' : Krut(int, {}) }, str )
>>> a['b']['e'] += 1
>>> a['a']['c'] += [ 3 ]
>>> a['d'] += "toto"
>>> a.tprint()
{
'a' : {
'c' : [3],
},
'b' : {
'e' : 1,
},
'd' : 'toto',
}
slightly enhanced Dict
pretty printing with indentation in traditional fashion
pretty printing the VectorDict in flattened vectorish representation
pretty printing in a tree like form a la Perl
slightly enhanced Dict
Return all elements common to two different trees; raises an exception if two corresponding leaves are different
#TOFIX : dont make a newdict if doing in place operations
>>> from vector_dict.VectorDict import convert_tree, VectorDict,Element,Path
>>> a = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, c = 2 ) ) } )
>>> b = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, d = 1 ) ) } )
>>> a.intersection(b).tprint()
{
a : {
b : 1,
},
}
>>> b = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, c = 1 ) ) } )
>>> a.intersection(b).tprint()
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "vector_dict/VectorDict.py", line 634, in intersection
new_dict[k] = (self[k]).intersection( other[k] )
File "vector_dict/VectorDict.py", line 639, in intersection
other[k]
Exception: ('CollisionError', '2 != 1')
Return the elements present in only one of the two dicts. Throws a CollisionError if two corresponding leaves in the trees are different
>>> from vector_dict.VectorDict import convert_tree, VectorDict,Element,Path
>>> a = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, d = 2, c=1 ) ), 'e' : 1 } )
>>> b = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, c = 1 ) ) } )
>>> a.symmetric_difference(b)
defaultdict(<type 'int'>, {'a': defaultdict(<type 'int'>, {'d': 2}), 'e': 1})
>>> b = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, c = 2 ) ) } )
>>> a.symmetric_difference(b)
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "vector_dict/VectorDict.py", line 694, in symmetric_difference
for path,v in self.intersection(other).as_vector_iter():
File "vector_dict/VectorDict.py", line 658, in intersection
new_dict[k] = (self[k]).intersection( other[k] )
File "vector_dict/VectorDict.py", line 663, in intersection
other[k]
Exception: ('CollisionError', '1 != 2')
return the union of two dicts
>>> from vector_dict.VectorDict import convert_tree, VectorDict,Element,Path
>>> b = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, c = 1 ) ) } )
>>> a = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, d = 2, c=1 ) ), 'e' : 1 } )
>>> b.union(a)
defaultdict(<type 'int'>, {'a': defaultdict(<type 'int'>, {'c': 1, 'b': 1, 'd': 2}), 'e': 1})
>>> a = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, d = 2, c=3 ) ) } )
>>> b.union(a)
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "vector_dict/VectorDict.py", line 669, in union
return self + other - self.intersection(other)
File "vector_dict/VectorDict.py", line 658, in intersection
new_dict[k] = (self[k]).intersection( other[k] )
File "vector_dict/VectorDict.py", line 663, in intersection
other[k]
Exception: ('CollisionError', '1 != 3')
tells if all elements of self are included in other
>>> from vector_dict.VectorDict import convert_tree, VectorDict,Element,Path
>>> a = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, d = 2, c=1 ) ), 'e' : 1 } )
>>> b = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, c = 1 ) ) } )
>>> b.issubset(a)
True
>>> a.issubset(b)
False
tells if all elements of other are included in self; throws an exception if two leaves in the two trees have different values
>>> from vector_dict.VectorDict import convert_tree, VectorDict,Element,Path
>>> a = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, d = 2, c=1 ) ), 'e' : 1 } )
>>> b = VectorDict( int, { 'a' : VectorDict( int, dict( b = 1, c = 1 ) ) } )
>>> a.issuperset(b)
True
>>> b.issuperset(a)
False
slightly enhanced Dict
>>> a = convert_tree({ 'a' : { 'a' : { 'r' : "yop" , 'b' : { 'c' : 1 }}}})
>>> a.tprint()
{
a = {
a = {
r = 'yop',
b = {
c = 1,
},
},
},
}
>>> [ e for e in a.as_vector_iter() ]
[(('a', 'a', 'r'), 'yop'), (('a', 'a', 'b', 'c'), 1)]
>>> a = convert_tree({ 'a' : { 'a' : { 'r' : "yop" , 'b' : { 'c' : 1 }}}})
>>> a.tprint()
{
a : {
a : {
r : 'yop',
b : {
c : 1,
},
},
},
}
>>> [ e for e in a.as_row_iter() ]
[['a', 'a', 'r', 'yop'], ['a', 'a', 'b', 'c', 1]]
slightly enhanced Dict
>>> intricated = convert_tree( { 'a' : { 'a' : { 'b' : { 'c' : 1 } } } } )
>>> pt = intricated.at( ( 'a', 'a', 'b' ) )
>>> pt
defaultdict(<class 'vector_dict.VectorDict.VectorDict'>, {'c': 1})
>>> pt['c'] = 2
>>> intricated.tprint()
{
a : {
a : {
b : {
c : 2,
},
},
},
}
>>> intricated.at( ( 'a', 'a', 'b' ), lambda x : x * -2 )
defaultdict(<class 'vector_dict.VectorDict.VectorDict'>, {'c': -4})
>>> intricated.pprint()
a->a->b->c = -4
Get a copy of the element at the coordinates given by the path. Throws a KeyError exception if the path does not lead to an element
>>> from vector_dict.VectorDict import convert_tree, VectorDict
>>> intricated = convert_tree( { 'a' : { 'a' : { 'b' : { 'c' : 1 } } } } )
>>> intricated.get_at( 'a', 'a', 'b' )
defaultdict(<class 'vector_dict.VectorDict.VectorDict'>, {'c': 1})
>>> intricated.get_at( 'a', 'a', 'b', 'c' )
1
>>> intricated.get_at( 'oops' )
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "vector_dict/VectorDict.py", line 304, in get_at
return self.at( path, None , True)
File "vector_dict/VectorDict.py", line 330, in at
value = here[path[ -1 ] ]
KeyError: 'oops'
delete all items at path
>>> a = VectorDict(int, {})
>>> a.build_path( 'g', 'todel' , True )
>>> a.build_path( 'g', 'tokeep' , True )
>>> a.tprint()
{
g : {
tokeep : True,
todel : True,
},
}
>>> a.prune( 'g', 'todel' )
>>> a.tprint()
{
g : {
tokeep : True,
},
}
apply a function to the value if the predicate on the key matches
implementation of constructing a path in a tree; the argument is a series of keys
>>> a = VectorDict(int, {})
>>> a.build_path( 'k', 1 )
>>> a.tprint()
{
k = 1,
}
>>> a.build_path( 'b', 'n', [ 1 ] )
>>> a.build_path( 'd', 'e', 2 )
>>> a.build_path( 'd', 'f', 4 )
>>> a.tprint()
{
k : 1,
b : {
n : [1],
},
d : {
e : 2,
f : 4,
},
}
slightly enhanced Dict
norm of a vector dict = sqrt( a . a )
scalar product = sum of self[k] * other[k] for each distinct key in common; also the norm of the projection of self on other
jaccard similarity of two vector dicts a . b / ( ||a||^2 + ||b||^2 - a . b )
cosine similarity of two vector dicts a . b / ( ||a||*||b|| )
>>> cos(
VectorDict( int, dict( x=1 , y=1) ),
VectorDict( int, dict( x=1 , y=0) ),
)
0.7071067811865475
>>> dot(
VectorDict( int, dict( x=1 , y=1, z=0) ),
VectorDict( int, dict( x=1 , y=1, z=1) ),
)
2.0
from vector_dict.VectorDict import iter_object, VectorDict
from csv import reader, writer
import os
from io import StringIO
from numpy import array
### CSV of langage users in a nosql DB (or any cloudish crap)
### having individual votes of cooolness per langage
### should be a very large dataset to be funnier
mocking_nosql = u"""george,FR,fun,perl,2
roger,FR,serious,python,3
christine,DE,fun,python,3
bob,US,serious,php,1
isabelle,FR,fun,perl,10
Kallista,FR,unfun,c#,-10
Nellev,FR,typorigid,python,1
haypo,FR,javabien,python,1
potrou,FR,globally locked,python,1
petra,DE,sexy,VHDL,69"""
# csv.reader plays the role of the "nosql" result-set iterator
nosql_iterator = reader
### interesting columns/key we want to emit
# column indices in each CSV record: name,country,mood,langage,coolness
COUNTRY = 1
LANGAGE = 3
COOLNESS = 4
# CSV writer on stdout used to display the results
w = writer(os.sys.stdout)
## basic example for counting langage users nothing more interesting than
## what collections.counter does
print ""
w.writerow(["langage", "how many users"])
# map/reduce pipeline (Python 2 eager map/reduce):
#   map   : emit one VectorDict {langage: 1} per record
#   reduce: merge them all with += (summing the counters key by key)
#   iter_object(..., flatten=True) yields flat rows for the CSV writer
map(w.writerow,
iter_object(
reduce(
## well that is a complex reduce operation :)
VectorDict.__iadd__,
map(
lambda document: VectorDict(int, {document[LANGAGE]: 1}),
nosql_iterator(StringIO(mocking_nosql))
)
),
flatten = True
)
)
"""expected result :
langage,how many users
python,3
c#,1
php,1
VHDL,1
perl,2
"""
print "\n" * 2 + "next test\n"
### example with group by + aggregation of a counter
### counting all langage per country with their coolness and number of users
### Hum .... I miss a sort and a limit by to be completely SQL like compatible
w.writerow(["country", "langage", "coolness", "howmany"])
# map/filter/reduce pipeline:
#   filter: drop the php records before mapping
#   map   : emit {country: {langage: array([coolness, 1])}} per record,
#           plus a running 'total_coolness_and_voters' subtotal
#   reduce: merge everything with += (numpy arrays add element-wise,
#           so the counter and the presence count aggregate in one go)
map(w.writerow,
iter_object(
##nosql like reduce
reduce(
## well that is the same very complex reduce operation :)
VectorDict.__iadd__,
##nosql like map where we emit interesting subset of the record
map(
lambda document: VectorDict(
VectorDict, {
#KEY
document[COUNTRY]:
VectorDict(
array,
{
#GROUPBY
document[LANGAGE]:
#AGGREGATOR
array([
#Counter
int(document[COOLNESS]),
#presence
1
])
}
),
## making a (sub) total on the fly
'total_coolness_and_voters': array([
int(document[COOLNESS]), 1
])
}
),
## nosql like filter that should be in the map if it was nosql
## maybe we also need a map that accepts None as a result and
## skip the result in the resulting iterator,
## or a skip() callable in lambda ?
## or sub() like in perl with curly braces
## joke : combining map / filter is easy enough
filter(
lambda document: "php" != document[LANGAGE],
nosql_iterator(StringIO(mocking_nosql))
)
)
),
flatten = True
)
)
"""expected result :
country,langage,coolness,howmany
FR,python,4,2
FR,c#,-10,1
FR,perl,12,2
total_votes_and_coolness,7,78
DE,python,3,1
DE,VHDL,69,1
"""
from vector_dict.VectorDict import VectorDict, convert_tree, is_leaf
from vector_dict.Clause import *
def findme(v, a_tree):
    """Match the tree pattern *a_tree* against node *v*.

    Returns None for leaf nodes, otherwise delegates to match_tree().
    """
    if is_leaf(v):
        return None
    return v.match_tree(a_tree)
def positive(v):
    """Predicate: True when *v* is strictly greater than zero.

    (A named ``def`` instead of a lambda assignment, per PEP 8 E731.)
    """
    return v > 0
def pretty_present(list):
print "Result "
for el in list:
print "path %r " % el[0]
print "has value %s" % (hasattr(el[1], 'tformat') and el[1].tformat() or el[1] )
print
# build a sample tree with three sub-domains to query against
w = convert_tree( dict(
a = dict( c= 1),
e = dict( d= 3.0) ,
b = dict( c = 1 , d = 4 )
))
w.tprint()
# find every subtree that has both a 'c' and a 'd' key
pretty_present( w.find( lambda p, v : has_all( 'c', 'd' )(v) ) )
# same query expressed as a structural match on leaf types
pretty_present( w.find( lambda p, v : findme(v, dict(
d= has_type(int),
c=has_type(int) )
) ) )
# match a whole sub-tree shape: an 'a' child whose 'c' leaf is positive
pretty_present( w.find( lambda p, v : has_tree({ 'a' : { 'c' : positive } })(v)) )
# numeric leaves (int or float) strictly greater than 3
pretty_present( w.find( lambda p, v : (
has_type( int )(v) or has_type(float)(v)
) and v > 3 ))
# every path whose last component is 'c'
pretty_present( w.find( lambda p, v : p.endswith( [ 'c' ] ) ) )
"""
RESULTS :
{
a = {
c = 1,
},
b = {
c = 1,
d = 4,
},
e = {
d = 3.0,
},
}
Result
path ['b']
has value {
c = 1,
d = 4,
}
Result
path ['b']
has value {
c = 1,
d = 4,
}
Result
path []
has value {
a = {
c = 1,
},
b = {
c = 1,
d = 4,
},
e = {
d = 3.0,
},
}
Result
path ['b', 'd']
has value 4
Result
path ['a', 'c']
has value 1
path ['b', 'c']
has value 1
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" parallel wordcounting with VectorDict
Notice we can also get all first letter count in the same operations
unlike regular word count methods"""
## Making the input
# wget http://www.gutenberg.org/cache/epub/26740/pg26740.txt
# mv pg26740.txt dorian.txt
# split -l 2125 dorian.txt dorian_splited.
import os, sys, inspect
# make the parent directory importable so the vector_dict package resolves
# when this example is run from inside its own folder
cmd_folder = os.path.abspath(
os.path.join(
os.path.split(
inspect.getfile( inspect.currentframe() )
)[0] ,
".."
)
)
if cmd_folder not in sys.path:
sys.path.insert(0, cmd_folder)
from multiprocessing import Pool
import string
import re
#from codecs import open as open
from vector_dict.VectorDict import VectorDict as vd
# the four pre-split chunks of dorian.txt (see the wget/split recipe above)
FILES = [ "dorian_splited.aa", "dorian_splited.ab",
"dorian_splited.ac", "dorian_splited.ad" ]
def word_count( unicode_file ):
    """Count the words (length > 2) of *unicode_file* into a VectorDict.

    Besides the per-word count, the same pass also aggregates:
      - 'begin_with' : count of words per first letter
      - 'has_size'   : count of words per word length

    Returns a VectorDict(int, ...) suitable for merging with '+'.
    """
    exclude = set(string.punctuation)

    def clean(word):
        # strip punctuation characters from the word
        # (the original wrapped this in a factory whose parameter was both
        # misspelled and unused — the closure already captures `exclude`)
        return ''.join(ch for ch in word if ch not in exclude)

    # split on sentence punctuation and whitespace
    sp_pattern = re.compile( """[\.\!\"\s\-\,\']+""", re.M)
    res = vd( int, {})
    # 'with' guarantees the file handle is closed (the original leaked it)
    with open(unicode_file) as f:
        for line in f:
            for word in map( clean, map( str.lower, sp_pattern.split(line) ) ):
                if len(word) > 2 :
                    res += vd(int, {
                        word : 1 ,
                        'begin_with' :
                            vd(int, { word[0] : 1 }) ,
                        'has_size' :
                            vd(int, { len(word) : 1 } )
                    })
    return res
# fan the file chunks out to one worker process each, then merge the
# partial VectorDicts with '+' (Python 2 builtin reduce)
p = Pool()
result=p.map(word_count, FILES )
result = reduce(vd.__add__,result)
print "Frequency of words begining with"
result['begin_with'].tprint()
# drop the aggregate sub-trees so only the plain word counts remain
result.prune( "begin_with")
print "Repartition of size words size"
result['has_size'].tprint()
result.prune( "has_size")
# NOTE(review): islice is imported here but never used below
from itertools import islice
print "TOP 40 most used words"
print "\n".join(
"%10s=%s" % (x, y) for x, y in sorted(result.items(), key=lambda x: x[1],reverse=True)[:40]
)
"""
EXPECTED RESULTS :
Frequency of words begining with
{
'\xc3' : 4,
'r' : 1426,
'a' : 5372,
'1' : 10,
'c' : 2778,
'b' : 2659,
'e' : 1420,
'd' : 2564,
'g' : 1534,
'f' : 2647,
'i' : 942,
'h' : 5867,
'k' : 483,
'j' : 245,
'm' : 2567,
'l' : 2706,
'o' : 1669,
'n' : 1416,
'q' : 199,
'p' : 2173,
's' : 5434,
'2' : 2,
'u' : 407,
't' : 9255,
'w' : 5491,
'v' : 507,
'y' : 2077,
'x' : 9,
'z' : 3,
}
TOP 40 most used words
the=3786
and=2216
you=1446
that=1362
was=1083
his=995
had=833
with=661
him=661
for=588
have=561
not=474
her=439
she=431
dorian=420
what=399
but=396
one=393
are=372
there=337
they=318
would=308
all=294
said=262
don=255
from=254
were=251
lord=248
henry=237
been=236
life=232
like=227
who=224
about=223
when=218
your=207
some=205
gray=205
them=203
will=201
"""