Package prest :: Module db
[hide private]
[frames] | no frames]

Source Code for Module prest.db

  1  #!/usr/bin/env python3 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """Classes to connect to databases.""" 
  5   
  6  from __future__ import absolute_import, unicode_literals 
  7  import sys 
  8  import sqlite3 
  9  import time 
 10  import re 
 11   
 12   
13 -class OpenFileError(Exception):
14 - def __init__(self, value):
15 self.value = value
16
17 - def __str__(self):
18 return repr('Cannot open file "%s".' % self.value)
19 20
21 -class DatabaseConnector(object):
22 """Implement methods for accessing the databases. 23 24 G{classtree DatabaseConnector} 25 """ 26 27 singleQuoteRegex = re.compile("'") 28
29 - def __init__(self, dbname, maxN=1):
30 """DababaseConnector creator. 31 32 @param dbname: 33 Path to the database file. 34 @type dbname: str 35 @param maxN: 36 The n in the longer database n-grams table. 37 @type maxN: int 38 """ 39 self.maxN = maxN 40 self.dbname = dbname
41
42 - def crt_ngram_table(self, n=1):
43 """Creates a table in the database to store n-gram of a given n. 44 45 @param n: 46 The n in n-gram. A table called [n]_gram (where [n] is the n 47 parameter) will be created. 48 @type n: int 49 """ 50 query = 'CREATE TABLE IF NOT EXISTS _{0}_gram ('.format(n) 51 unique = '' 52 for i in reversed(range(n)): 53 if i != 0: 54 unique += 'word_{0}, '.format(i) 55 query += 'word_{0} TEXT, '.format(i) 56 else: 57 unique += 'word' 58 query += 'word TEXT, count INTEGER, UNIQUE({0}) );'.format( 59 unique) 60 self.execute_sql(query)
61
62 - def drop_ngram_table(self, n=1):
63 """Drop a n-gram table in the database. 64 65 @param n: 66 The n in n-gram. 67 @type n: int 68 """ 69 query = 'DROP TABLE IF EXISTS _{0}_gram;'.format(n) 70 self.execute_sql(query)
71
72 - def crt_index(self, n):
73 """Create the index for the table storing n-gram of given n. 74 75 @param n: 76 The n in n-gram. 77 @type n: int 78 """ 79 for i in reversed(range(n)): 80 if i != 0: 81 query = 'CREATE INDEX idx_{0}_gram_{1} ON'\ 82 '_{0}_gram(word_{1});'.format(n, i) 83 self.execute_sql(query)
84
85 - def dlt_index(self, n):
86 """'Drop the index for the table storing n-gram of given n. 87 88 @param n: 89 The n in n-gram. 90 @type n: int 91 """ 92 for i in reversed(range(n)): 93 if i != 0: 94 query = 'DROP INDEX IF EXISTS idx_{0}_gram_{1};'.format( 95 n, i) 96 self.execute_sql(query)
97
98 - def ngrams(self, withCounts=False):
99 """Returns all ngrams that are in every tables of the database. 100 101 @note: The result is convert to tuple before being returned because it 102 is used as a dictionary key and only immutable types can be used 103 as dictionary key, so list cannot. 104 105 @param withCounts: 106 Indicate if the ngrams counts (number of occurences) should be 107 returned too. 108 @type withCounts: bool 109 110 @return: 111 The n-grams of each tables of the database. 112 @rtype: tuple 113 """ 114 query = 'SELECT ' 115 for i in reversed(range(self.maxN)): 116 if i != 0: 117 query += 'word_{0}, '.format(i) 118 elif i == 0: 119 query += 'word' 120 if withCounts: 121 query += ', count' 122 query += ' FROM _{0}_gram;'.format(self.maxN) 123 result = self.execute_sql(query) 124 for row in result: 125 yield tuple(row)
126
127 - def sum_ngrams_occ(self, n):
128 """Compute the occurences sum of every n-grams of given n in database. 129 130 @param n: 131 The n in n-gram. 132 @type n: int 133 134 @return: 135 The sum of the number of occurences of every n-grams in the n-grams 136 table of given n in the database. 137 @rtype: int 138 """ 139 query = 'SELECT SUM(count) from _' + str(n) + '_gram;' 140 return self.extract_first_integer(self.execute_sql(query))
141
142 - def ngrams_in_table(self, n):
143 """Compute the number of n-grams in the n-grams table of given n. 144 145 @param n: 146 The n in n-gram. 147 @type n: int 148 149 @return: 150 The number of n-grams in the n-grams table of given n in the 151 database. 152 @rtype: int 153 """ 154 table = '_' + str(n) + '_gram' 155 query = 'SELECT Count() from _' + str(n) + '_gram;' 156 return self.extract_first_integer(self.execute_sql(query))
157
158 - def ngram_count(self, ngram):
159 """Retrieve the number of occurences of a given ngram in the database. 160 161 @param ngram: 162 The n-gram for which number of occurences must be retrieved. 163 @type ngram: list 164 165 @return: 166 The number of occurences of the n-gram or 0 if the n-gram is not 167 in the databade. 168 @rtype: int 169 """ 170 query = 'SELECT count FROM _{0}_gram'.format(len(ngram)) 171 query += self.make_where_clause(ngram) + ';' 172 return self.extract_first_integer(self.execute_sql(query))
173
174 - def ngram_table_tp(self, ngram, limit=-1):
175 """Retrieve the n-gram records which complete the given n-gram. 176 177 For instance, if ngram is:: 178 ['on', 'the', 'ta'] 179 Then the returned records would be somthing like:: 180 [['on', 'the', 'table' ], 5] 181 [['on', 'the', 'take' ], 1] 182 [['on', 'the', 'taskbar'], 1] 183 184 @note: the query makes sure the n-grams are returned in descending order 185 according to their number of occurences. This is important 186 because the predictors predict() methods which would call this 187 method can limit their suggestion number so the most probable 188 suggestions (which are based on most frequent n-grams) must be 189 append to the list first. 190 191 @param ngram: 192 The n-gram words of the record to retrieve in the database. 193 @type ngram: list 194 @param limit: 195 Maximum number of records to retrieve. 196 @type limit: int 197 198 @return: 199 Return the n-grams records (n-gram words + number of occurences) 200 completing the n-gram in the database or an empty list if no n-grams 201 have been found. 202 @rtype: list 203 """ 204 query = '{0} FROM _{1}_gram {2} ORDER BY count DESC'.format( 205 self.make_select_like_clause(len(ngram)), len(ngram), 206 self.make_where_like_clause(ngram)) 207 if limit < 0: 208 query += ';' 209 else: 210 query += ' LIMIT ' + str(limit) + ';' 211 return self.execute_sql(query)
212
213 - def insert_ngram(self, ngram, count):
214 """Insert an n-gram with its number of occurences into the database. 215 216 @param ngram: 217 The n-gram to insert. 218 @type ngram: list 219 @param count: 220 The number of occurences of the n-gram to insert. It is usually 1 as 221 the n-grams should be inserted in the database only if they are not 222 already in it and they are added when they are read on the input 223 buffer. 224 @type count: int 225 """ 226 query = 'INSERT INTO _{0}_gram {1};'.format(len(ngram), 227 self.make_values_clause(ngram, count)) 228 self.execute_sql(query)
229
230 - def update_ngram(self, ngram, count):
231 """Update an n-gram number of occurences in the database. 232 233 @warning: The ngram has to be in the database, otherwise this method 234 will fail. 235 236 @param ngram: 237 The n-gram to update (!!! should be in the database !!!). 238 @type ngram: list 239 @param count: 240 The number of occurences of the n-gram. 241 @type count: int 242 """ 243 query = 'UPDATE _{0}_gram SET count = {1}'.format(len(ngram), count) 244 query += self.make_where_clause(ngram) + ';' 245 self.execute_sql(query)
246
247 - def remove_ngram(self, ngram):
248 """Remove a given ngram from the databae. 249 250 @warning: The ngram has to be in the database, otherwise this method 251 will fail. 252 253 @param ngram: 254 The n-gram to delete (!!! should be in the database !!!). 255 @type ngram: list 256 """ 257 query = 'DELETE FROM _{0}_gram'.format(len(ngram)) 258 query += self.make_where_clause(ngram) + ';' 259 self.execute_sql(query)
260
261 - def make_values_clause(self, ngram, count):
262 ng = [self.singleQuoteRegex.sub("''", n) for n in ngram] 263 valuesClause = "VALUES('" + "', '".join(ng) + "', {0})".format(count) 264 return valuesClause
265
266 - def make_where_clause(self, ngram):
267 whereClause = " WHERE" 268 for i in range(len(ngram)): 269 n = self.singleQuoteRegex.sub("''", ngram[i]) 270 if i < (len(ngram) - 1): 271 whereClause += " word_{0} = '{1}' AND".format( 272 len(ngram) - i - 1, n) 273 else: 274 whereClause += " word = '{0}'".format(n) 275 return whereClause
276
277 - def make_select_like_clause(self, n):
278 result = "SELECT " 279 for i in reversed(range(1, n)): 280 result += "word_{0}, ". format(i) 281 result += "word, count" 282 return result
283
284 - def make_where_like_clause(self, ngram):
285 whereClause = " WHERE" 286 for i in range(len(ngram)): 287 if i < (len(ngram) - 1): 288 whereClause += " word_{0} = '{1}' AND".format( 289 len(ngram) - i - 1, ngram[i]) 290 else: 291 whereClause += " word LIKE '{0}%'".format(ngram[-1]) 292 return whereClause
293
294 - def extract_first_integer(self, table):
295 count = 0 296 if table and table[0]: 297 count = int(table[0][0]) 298 if not count > 0: 299 count = 0 300 return count
301 302
303 -class SqliteDatabaseConnector(DatabaseConnector):
304 """Database connector for sqlite databases. 305 306 G{classtree SqliteDatabaseConnector} 307 """ 308
309 - def __init__(self, dbname, maxN=1):
310 """SqliteDatabaseConnector creator. 311 312 @param dbname: 313 Path to the database file. 314 @type dbname: str 315 @param maxN: 316 The n in the longer database n-grams table. 317 @type maxN: int 318 """ 319 DatabaseConnector.__init__(self, dbname, maxN) 320 self.con = None 321 self.open_database()
322
323 - def commit(self):
324 """Send a commit to the database.""" 325 self.con.commit()
326
327 - def open_database(self):
328 """Open the database.""" 329 try: 330 self.con = sqlite3.connect(self.dbname) 331 except sqlite3.OperationalError: 332 lg.error("Cannot open database file '%s'" % (self.dbname)) 333 raise OpenFileError(self.dbname)
334
335 - def close_database(self):
336 """Close the database.""" 337 if self.con: 338 self.con.close()
339
340 - def execute_sql(self, query):
341 """Execute a query string on an open database. 342 343 @param query: 344 The query to execute. 345 @type query: str 346 347 @return: 348 The query result. 349 @rtype: list 350 """ 351 c = self.con.cursor() 352 c.execute(query) 353 result = c.fetchall() 354 return result
355 356
357 -def insert_ngrams(ngramMap, n, outfile, append=False, createIndex=False, 358 callback=None):
359 """Insert every n-grams of the map in the database. 360 361 This function open a database and insert or update every n-grams of the 362 given list in it. 363 364 @param ngramMap: 365 The list of n-grams to insert or update. 366 @type ngramMap: list 367 @param n: 368 The n in n-gram. 369 @type n: int 370 @param outfile: 371 Path to the database. 372 @type outfile: str 373 @param append: 374 Indicate weither the n-gram should be append to the database. If the 375 database isn't empty, this function could raise conflict when trying 376 to insert an n-gram which is already in the database. When such 377 conflicts happen the append value is important: 378 - True: conflicting records count values are updated. 379 - False: conflicting records are replaced by new records. 380 @type append: bool 381 @param createIndex: 382 Indicate weither the database table indexes should be created. 383 @type createIndex: bool 384 @param callback: 385 The callback is used to show the progress percentage. In the gui a 386 callback method is implemented to update a progress bar showing the 387 n-grams insertion progress (cf. gui.py). 388 @type callback: fun(float, ...) 389 """ 390 progress = 0 391 sql = SqliteDatabaseConnector(outfile, n) 392 sql.crt_ngram_table(n) 393 for ngram, count in ngramMap.items(): 394 if append: 395 oldCount = sql.ngram_count(ngram) 396 if oldCount > 0: 397 sql.update_ngram(ngram, oldCount + count) 398 else: 399 sql.insert_ngram(ngram, count) 400 else: 401 sql.remove_ngram(ngram) # avoid IntegrityError 402 sql.insert_ngram(ngram, count) 403 progress += 100 / len(ngramMap) 404 if callback: 405 callback(progress) 406 sql.commit() 407 if createIndex and not append: 408 sql.crt_index(n) 409 sql.close_database()
410