Package tipy :: Module db
[hide private]
[frames] | no frames]

Source Code for Module tipy.db

  1  #!/usr/bin/env python3 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """Classes to connect to databases.""" 
  5   
  6  from sqlite3 import connect, OperationalError 
  7  from re import compile 
  8  from tipy.lg import lg 
  9   
 10   
11 -class OpenFileError(Exception):
12 - def __init__(self, value):
13 self.value = value
14
15 - def __str__(self):
16 return repr('Cannot open file "%s".' % self.value)
17 18
19 -class DatabaseConnector(object):
20 """Implement methods for accessing the databases. 21 22 G{classtree DatabaseConnector} 23 """ 24 25 singleQuoteRegex = compile("'") 26
27 - def __init__(self, dbname, maxN=1):
28 """DababaseConnector creator. 29 30 @param dbname: 31 Path to the database file. 32 @type dbname: str 33 @param maxN: 34 The n in the longer database n-grams table. 35 @type maxN: int 36 """ 37 self.maxN = maxN 38 self.dbname = dbname
39
40 - def crt_ngram_table(self, n=1):
41 """Creates a table in the database to store n-gram of a given n. 42 43 @param n: 44 The n in n-gram. A table called [n]_gram (where [n] is the n 45 parameter) will be created. 46 @type n: int 47 """ 48 query = 'CREATE TABLE IF NOT EXISTS _{0}_gram ('.format(n) 49 unique = '' 50 for i in reversed(range(n)): 51 if i != 0: 52 unique += 'word_{0}, '.format(i) 53 query += 'word_{0} TEXT, '.format(i) 54 else: 55 unique += 'word' 56 query += 'word TEXT, count INTEGER, UNIQUE({0}) );'.format( 57 unique) 58 self.execute_sql(query)
59
60 - def drop_ngram_table(self, n=1):
61 """Drop a n-gram table in the database. 62 63 @param n: 64 The n in n-gram. 65 @type n: int 66 """ 67 query = 'DROP TABLE IF EXISTS _{0}_gram;'.format(n) 68 self.execute_sql(query)
69
70 - def crt_index(self, n):
71 """Create the index for the table storing n-gram of given n. 72 73 @param n: 74 The n in n-gram. 75 @type n: int 76 """ 77 for i in reversed(range(n)): 78 if i != 0: 79 query = 'CREATE INDEX IF NOT EXISTS idx_{0}_gram_{1} ON '\ 80 '_{0}_gram(word_{1});'.format(n, i) 81 self.execute_sql(query)
82
83 - def dlt_index(self, n):
84 """Drop the index for the table storing n-gram of given n. 85 86 @param n: 87 The n in n-gram. 88 @type n: int 89 """ 90 for i in reversed(range(n)): 91 if i != 0: 92 query = 'DROP INDEX IF EXISTS idx_{0}_gram_{1};'.format( 93 n, i) 94 self.execute_sql(query)
95
97 """Create the database tables and indexes if they don't exists. 98 99 This is usefull to avoid crash when trying to hit an unexisting table. 100 """ 101 for n in range(1, int(self.maxN) + 1): 102 self.crt_ngram_table(n) 103 self.crt_index(n)
104
105 - def ngrams(self, withCounts=False):
106 """Returns all ngrams that are in every tables of the database. 107 108 @note: The result is convert to tuple before being returned because it 109 is used as a dictionary key and only immutable types can be used 110 as dictionary key, so list cannot. 111 112 @param withCounts: 113 Indicate if the ngrams counts (number of occurences) should be 114 returned too. 115 @type withCounts: bool 116 117 @return: 118 The n-grams of each tables of the database. 119 @rtype: tuple 120 """ 121 query = 'SELECT ' 122 for i in reversed(range(self.maxN)): 123 if i != 0: 124 query += 'word_{0}, '.format(i) 125 elif i == 0: 126 query += 'word' 127 if withCounts: 128 query += ', count' 129 query += ' FROM _{0}_gram;'.format(self.maxN) 130 result = self.execute_sql(query) 131 for row in result: 132 yield tuple(row)
133
134 - def sum_ngrams_occ(self, n):
135 """Compute the occurences sum of every n-grams of given n in database. 136 137 @param n: 138 The n in n-gram. 139 @type n: int 140 141 @return: 142 The sum of the number of occurences of every n-grams in the n-grams 143 table of given n in the database. 144 @rtype: int 145 """ 146 query = 'SELECT SUM(count) from _' + str(n) + '_gram;' 147 return self.extract_first_integer(self.execute_sql(query))
148
149 - def ngrams_in_table(self, n):
150 """Compute the number of n-grams in the n-grams table of given n. 151 152 @param n: 153 The n in n-gram. 154 @type n: int 155 156 @return: 157 The number of n-grams in the n-grams table of given n in the 158 database. 159 @rtype: int 160 """ 161 table = '_' + str(n) + '_gram' 162 query = 'SELECT Count() from _' + str(n) + '_gram;' 163 return self.extract_first_integer(self.execute_sql(query))
164
165 - def ngram_count(self, ngram):
166 """Retrieve the number of occurences of a given ngram in the database. 167 168 @param ngram: 169 The n-gram for which number of occurences must be retrieved. 170 @type ngram: list 171 172 @return: 173 The number of occurences of the n-gram or 0 if the n-gram is not 174 in the databade. 175 @rtype: int 176 """ 177 query = 'SELECT count FROM _{0}_gram'.format(len(ngram)) 178 query += self.make_where_clause(ngram) + ';' 179 return self.extract_first_integer(self.execute_sql(query))
180
181 - def ngram_table_tp(self, ngram, limit=-1):
182 """Retrieve the n-gram records which complete the given n-gram. 183 184 For instance, if ngram is:: 185 ['on', 'the', 'ta'] 186 Then the returned records would be somthing like:: 187 [['on', 'the', 'table' ], 5] 188 [['on', 'the', 'take' ], 1] 189 [['on', 'the', 'taskbar'], 1] 190 191 @note: the query makes sure the n-grams are returned in descending order 192 according to their number of occurences. This is important 193 because the predictors predict() methods which would call this 194 method can limit their suggestion number so the most probable 195 suggestions (which are based on most frequent n-grams) must be 196 append to the list first. 197 198 @param ngram: 199 The n-gram words of the record to retrieve in the database. 200 @type ngram: list 201 @param limit: 202 Maximum number of records to retrieve. 203 @type limit: int 204 205 @return: 206 Return the n-grams records (n-gram words + number of occurences) 207 completing the n-gram in the database or an empty list if no n-grams 208 have been found. 209 @rtype: list 210 """ 211 query = '{0} FROM _{1}_gram {2} ORDER BY count DESC'.format( 212 self.make_select_like_clause(len(ngram)), len(ngram), 213 self.make_where_like_clause(ngram)) 214 if limit < 0: 215 query += ';' 216 else: 217 query += ' LIMIT ' + str(limit) + ';' 218 return self.execute_sql(query)
219
220 - def insert_ngram(self, ngram, count):
221 """Insert an n-gram with its number of occurences into the database. 222 223 @param ngram: 224 The n-gram to insert. 225 @type ngram: list 226 @param count: 227 The number of occurences of the n-gram to insert. It is usually 1 as 228 the n-grams should be inserted in the database only if they are not 229 already in it and they are added when they are read on the input 230 buffer. 231 @type count: int 232 """ 233 query = 'INSERT INTO _{0}_gram {1};'.format(len(ngram), 234 self.make_values_clause(ngram, count)) 235 self.execute_sql(query)
236
237 - def update_ngram(self, ngram, count):
238 """Update an n-gram number of occurences in the database. 239 240 @warning: The ngram has to be in the database, otherwise this method 241 will fail. 242 243 @param ngram: 244 The n-gram to update (!!! should be in the database !!!). 245 @type ngram: list 246 @param count: 247 The number of occurences of the n-gram. 248 @type count: int 249 """ 250 query = 'UPDATE _{0}_gram SET count = {1}'.format(len(ngram), count) 251 query += self.make_where_clause(ngram) + ';' 252 self.execute_sql(query)
253
254 - def remove_ngram(self, ngram):
255 """Remove a given ngram from the databae. 256 257 @warning: The ngram has to be in the database, otherwise this method 258 will fail. 259 260 @param ngram: 261 The n-gram to delete (!!! should be in the database !!!). 262 @type ngram: list 263 """ 264 query = 'DELETE FROM _{0}_gram'.format(len(ngram)) 265 query += self.make_where_clause(ngram) + ';' 266 self.execute_sql(query)
267
268 - def make_values_clause(self, ngram, count):
269 ng = [self.singleQuoteRegex.sub("''", n) for n in ngram] 270 valuesClause = "VALUES('" + "', '".join(ng) + "', {0})".format(count) 271 return valuesClause
272
273 - def make_where_clause(self, ngram):
274 whereClause = " WHERE" 275 for i in range(len(ngram)): 276 n = self.singleQuoteRegex.sub("''", ngram[i]) 277 if i < (len(ngram) - 1): 278 whereClause += " word_{0} = '{1}' AND".format( 279 len(ngram) - i - 1, n) 280 else: 281 whereClause += " word = '{0}'".format(n) 282 return whereClause
283
284 - def make_select_like_clause(self, n):
285 result = "SELECT " 286 for i in reversed(range(1, n)): 287 result += "word_{0}, ". format(i) 288 result += "word, count" 289 return result
290
291 - def make_where_like_clause(self, ngram):
292 whereClause = " WHERE" 293 for i in range(len(ngram)): 294 if i < (len(ngram) - 1): 295 whereClause += " word_{0} = '{1}' AND".format( 296 len(ngram) - i - 1, ngram[i]) 297 else: 298 whereClause += " word LIKE '{0}%'".format(ngram[-1]) 299 return whereClause
300
301 - def extract_first_integer(self, table):
302 count = 0 303 if table and table[0] and table[0][0]: 304 count = int(table[0][0]) 305 if not count > 0: 306 count = 0 307 return count
308 309
310 -class SqliteDatabaseConnector(DatabaseConnector):
311 """Database connector for sqlite databases. 312 313 G{classtree SqliteDatabaseConnector} 314 """ 315
316 - def __init__(self, dbname, maxN=1):
317 """SqliteDatabaseConnector creator. 318 319 @param dbname: 320 Path to the database file. 321 @type dbname: str 322 @param maxN: 323 The n in the longer database n-grams table. 324 @type maxN: int 325 """ 326 DatabaseConnector.__init__(self, dbname, maxN) 327 self.con = None 328 self.open_database() 329 self.create_table_if_not_exists()
330
331 - def commit(self):
332 """Send a commit to the database.""" 333 self.con.commit()
334
335 - def open_database(self):
336 """Open the database.""" 337 try: 338 self.con = connect(self.dbname) 339 except OperationalError: 340 lg.error("Cannot open database file '%s'" % (self.dbname)) 341 raise OpenFileError(self.dbname)
342
343 - def close_database(self):
344 """Close the database.""" 345 if self.con: 346 self.con.close()
347
348 - def execute_sql(self, query):
349 """Execute a query string on an open database. 350 351 @param query: 352 The query to execute. 353 @type query: str 354 355 @return: 356 The query result. 357 @rtype: list 358 """ 359 c = self.con.cursor() 360 c.execute(query) 361 result = c.fetchall() 362 return result
363 364
365 -def insert_ngrams(ngramMap, n, outfile, append=False, createIndex=False, 366 callback=None):
367 """Insert every n-grams of the map in the database. 368 369 This function open a database and insert or update every n-grams of the 370 given list in it. 371 372 @param ngramMap: 373 The list of n-grams to insert or update. 374 @type ngramMap: list 375 @param n: 376 The n in n-gram. 377 @type n: int 378 @param outfile: 379 Path to the database. 380 @type outfile: str 381 @param append: 382 Indicate weither the n-gram should be append to the database. If the 383 database isn't empty, this function could raise conflict when trying 384 to insert an n-gram which is already in the database. When such 385 conflicts happen the append value is important: 386 - True: conflicting records count values are updated. 387 - False: conflicting records are replaced by new records. 388 @type append: bool 389 @param createIndex: 390 Indicate weither the database table indexes should be created. 391 @type createIndex: bool 392 @param callback: 393 The callback is used to show the progress percentage. In the gui a 394 callback method is implemented to update a progress bar showing the 395 n-grams insertion progress (cf. py). 396 @type callback: fun(float, ...) 397 """ 398 progress = 0 399 sql = SqliteDatabaseConnector(outfile, n) 400 sql.crt_ngram_table(n) 401 for ngram, count in ngramMap.items(): 402 if append: 403 oldCount = sql.ngram_count(ngram) 404 if oldCount > 0: 405 sql.update_ngram(ngram, oldCount + count) 406 else: 407 sql.insert_ngram(ngram, count) 408 else: 409 sql.remove_ngram(ngram) # avoid IntegrityError 410 sql.insert_ngram(ngram, count) 411 progress += 100 / len(ngramMap) 412 if callback: 413 callback(progress) 414 sql.commit() 415 if createIndex and not append: 416 sql.crt_index(n) 417 sql.close_database()
418