1
2
3
4 """Classes to connect to databases."""
5
6 from __future__ import absolute_import, unicode_literals
7 import sys
8 import sqlite3
9 import time
10 import re
11
12
16
18 return repr('Cannot open file "%s".' % self.value)
19
20
22 """Implement methods for accessing the databases.
23
24 G{classtree DatabaseConnector}
25 """
26
27 singleQuoteRegex = re.compile("'")
28
30 """DababaseConnector creator.
31
32 @param dbname:
33 Path to the database file.
34 @type dbname: str
35 @param maxN:
36 The n in the longer database n-grams table.
37 @type maxN: int
38 """
39 self.maxN = maxN
40 self.dbname = dbname
41
43 """Creates a table in the database to store n-gram of a given n.
44
45 @param n:
46 The n in n-gram. A table called [n]_gram (where [n] is the n
47 parameter) will be created.
48 @type n: int
49 """
50 query = 'CREATE TABLE IF NOT EXISTS _{0}_gram ('.format(n)
51 unique = ''
52 for i in reversed(range(n)):
53 if i != 0:
54 unique += 'word_{0}, '.format(i)
55 query += 'word_{0} TEXT, '.format(i)
56 else:
57 unique += 'word'
58 query += 'word TEXT, count INTEGER, UNIQUE({0}) );'.format(
59 unique)
60 self.execute_sql(query)
61
63 """Drop a n-gram table in the database.
64
65 @param n:
66 The n in n-gram.
67 @type n: int
68 """
69 query = 'DROP TABLE IF EXISTS _{0}_gram;'.format(n)
70 self.execute_sql(query)
71
73 """Create the index for the table storing n-gram of given n.
74
75 @param n:
76 The n in n-gram.
77 @type n: int
78 """
79 for i in reversed(range(n)):
80 if i != 0:
81 query = 'CREATE INDEX idx_{0}_gram_{1} ON'\
82 '_{0}_gram(word_{1});'.format(n, i)
83 self.execute_sql(query)
84
86 """'Drop the index for the table storing n-gram of given n.
87
88 @param n:
89 The n in n-gram.
90 @type n: int
91 """
92 for i in reversed(range(n)):
93 if i != 0:
94 query = 'DROP INDEX IF EXISTS idx_{0}_gram_{1};'.format(
95 n, i)
96 self.execute_sql(query)
97
98 - def ngrams(self, withCounts=False):
99 """Returns all ngrams that are in every tables of the database.
100
101 @note: The result is convert to tuple before being returned because it
102 is used as a dictionary key and only immutable types can be used
103 as dictionary key, so list cannot.
104
105 @param withCounts:
106 Indicate if the ngrams counts (number of occurences) should be
107 returned too.
108 @type withCounts: bool
109
110 @return:
111 The n-grams of each tables of the database.
112 @rtype: tuple
113 """
114 query = 'SELECT '
115 for i in reversed(range(self.maxN)):
116 if i != 0:
117 query += 'word_{0}, '.format(i)
118 elif i == 0:
119 query += 'word'
120 if withCounts:
121 query += ', count'
122 query += ' FROM _{0}_gram;'.format(self.maxN)
123 result = self.execute_sql(query)
124 for row in result:
125 yield tuple(row)
126
128 """Compute the occurences sum of every n-grams of given n in database.
129
130 @param n:
131 The n in n-gram.
132 @type n: int
133
134 @return:
135 The sum of the number of occurences of every n-grams in the n-grams
136 table of given n in the database.
137 @rtype: int
138 """
139 query = 'SELECT SUM(count) from _' + str(n) + '_gram;'
140 return self.extract_first_integer(self.execute_sql(query))
141
143 """Compute the number of n-grams in the n-grams table of given n.
144
145 @param n:
146 The n in n-gram.
147 @type n: int
148
149 @return:
150 The number of n-grams in the n-grams table of given n in the
151 database.
152 @rtype: int
153 """
154 table = '_' + str(n) + '_gram'
155 query = 'SELECT Count() from _' + str(n) + '_gram;'
156 return self.extract_first_integer(self.execute_sql(query))
157
159 """Retrieve the number of occurences of a given ngram in the database.
160
161 @param ngram:
162 The n-gram for which number of occurences must be retrieved.
163 @type ngram: list
164
165 @return:
166 The number of occurences of the n-gram or 0 if the n-gram is not
167 in the databade.
168 @rtype: int
169 """
170 query = 'SELECT count FROM _{0}_gram'.format(len(ngram))
171 query += self.make_where_clause(ngram) + ';'
172 return self.extract_first_integer(self.execute_sql(query))
173
175 """Retrieve the n-gram records which complete the given n-gram.
176
177 For instance, if ngram is::
178 ['on', 'the', 'ta']
179 Then the returned records would be somthing like::
180 [['on', 'the', 'table' ], 5]
181 [['on', 'the', 'take' ], 1]
182 [['on', 'the', 'taskbar'], 1]
183
184 @note: the query makes sure the n-grams are returned in descending order
185 according to their number of occurences. This is important
186 because the predictors predict() methods which would call this
187 method can limit their suggestion number so the most probable
188 suggestions (which are based on most frequent n-grams) must be
189 append to the list first.
190
191 @param ngram:
192 The n-gram words of the record to retrieve in the database.
193 @type ngram: list
194 @param limit:
195 Maximum number of records to retrieve.
196 @type limit: int
197
198 @return:
199 Return the n-grams records (n-gram words + number of occurences)
200 completing the n-gram in the database or an empty list if no n-grams
201 have been found.
202 @rtype: list
203 """
204 query = '{0} FROM _{1}_gram {2} ORDER BY count DESC'.format(
205 self.make_select_like_clause(len(ngram)), len(ngram),
206 self.make_where_like_clause(ngram))
207 if limit < 0:
208 query += ';'
209 else:
210 query += ' LIMIT ' + str(limit) + ';'
211 return self.execute_sql(query)
212
214 """Insert an n-gram with its number of occurences into the database.
215
216 @param ngram:
217 The n-gram to insert.
218 @type ngram: list
219 @param count:
220 The number of occurences of the n-gram to insert. It is usually 1 as
221 the n-grams should be inserted in the database only if they are not
222 already in it and they are added when they are read on the input
223 buffer.
224 @type count: int
225 """
226 query = 'INSERT INTO _{0}_gram {1};'.format(len(ngram),
227 self.make_values_clause(ngram, count))
228 self.execute_sql(query)
229
231 """Update an n-gram number of occurences in the database.
232
233 @warning: The ngram has to be in the database, otherwise this method
234 will fail.
235
236 @param ngram:
237 The n-gram to update (!!! should be in the database !!!).
238 @type ngram: list
239 @param count:
240 The number of occurences of the n-gram.
241 @type count: int
242 """
243 query = 'UPDATE _{0}_gram SET count = {1}'.format(len(ngram), count)
244 query += self.make_where_clause(ngram) + ';'
245 self.execute_sql(query)
246
248 """Remove a given ngram from the databae.
249
250 @warning: The ngram has to be in the database, otherwise this method
251 will fail.
252
253 @param ngram:
254 The n-gram to delete (!!! should be in the database !!!).
255 @type ngram: list
256 """
257 query = 'DELETE FROM _{0}_gram'.format(len(ngram))
258 query += self.make_where_clause(ngram) + ';'
259 self.execute_sql(query)
260
262 ng = [self.singleQuoteRegex.sub("''", n) for n in ngram]
263 valuesClause = "VALUES('" + "', '".join(ng) + "', {0})".format(count)
264 return valuesClause
265
267 whereClause = " WHERE"
268 for i in range(len(ngram)):
269 n = self.singleQuoteRegex.sub("''", ngram[i])
270 if i < (len(ngram) - 1):
271 whereClause += " word_{0} = '{1}' AND".format(
272 len(ngram) - i - 1, n)
273 else:
274 whereClause += " word = '{0}'".format(n)
275 return whereClause
276
278 result = "SELECT "
279 for i in reversed(range(1, n)):
280 result += "word_{0}, ". format(i)
281 result += "word, count"
282 return result
283
285 whereClause = " WHERE"
286 for i in range(len(ngram)):
287 if i < (len(ngram) - 1):
288 whereClause += " word_{0} = '{1}' AND".format(
289 len(ngram) - i - 1, ngram[i])
290 else:
291 whereClause += " word LIKE '{0}%'".format(ngram[-1])
292 return whereClause
293
301
302
304 """Database connector for sqlite databases.
305
306 G{classtree SqliteDatabaseConnector}
307 """
308
310 """SqliteDatabaseConnector creator.
311
312 @param dbname:
313 Path to the database file.
314 @type dbname: str
315 @param maxN:
316 The n in the longer database n-grams table.
317 @type maxN: int
318 """
319 DatabaseConnector.__init__(self, dbname, maxN)
320 self.con = None
321 self.open_database()
322
324 """Send a commit to the database."""
325 self.con.commit()
326
328 """Open the database."""
329 try:
330 self.con = sqlite3.connect(self.dbname)
331 except sqlite3.OperationalError:
332 lg.error("Cannot open database file '%s'" % (self.dbname))
333 raise OpenFileError(self.dbname)
334
336 """Close the database."""
337 if self.con:
338 self.con.close()
339
341 """Execute a query string on an open database.
342
343 @param query:
344 The query to execute.
345 @type query: str
346
347 @return:
348 The query result.
349 @rtype: list
350 """
351 c = self.con.cursor()
352 c.execute(query)
353 result = c.fetchall()
354 return result
355
356
357 -def insert_ngrams(ngramMap, n, outfile, append=False, createIndex=False,
358 callback=None):
359 """Insert every n-grams of the map in the database.
360
361 This function open a database and insert or update every n-grams of the
362 given list in it.
363
364 @param ngramMap:
365 The list of n-grams to insert or update.
366 @type ngramMap: list
367 @param n:
368 The n in n-gram.
369 @type n: int
370 @param outfile:
371 Path to the database.
372 @type outfile: str
373 @param append:
374 Indicate weither the n-gram should be append to the database. If the
375 database isn't empty, this function could raise conflict when trying
376 to insert an n-gram which is already in the database. When such
377 conflicts happen the append value is important:
378 - True: conflicting records count values are updated.
379 - False: conflicting records are replaced by new records.
380 @type append: bool
381 @param createIndex:
382 Indicate weither the database table indexes should be created.
383 @type createIndex: bool
384 @param callback:
385 The callback is used to show the progress percentage. In the gui a
386 callback method is implemented to update a progress bar showing the
387 n-grams insertion progress (cf. gui.py).
388 @type callback: fun(float, ...)
389 """
390 progress = 0
391 sql = SqliteDatabaseConnector(outfile, n)
392 sql.crt_ngram_table(n)
393 for ngram, count in ngramMap.items():
394 if append:
395 oldCount = sql.ngram_count(ngram)
396 if oldCount > 0:
397 sql.update_ngram(ngram, oldCount + count)
398 else:
399 sql.insert_ngram(ngram, count)
400 else:
401 sql.remove_ngram(ngram)
402 sql.insert_ngram(ngram, count)
403 progress += 100 / len(ngramMap)
404 if callback:
405 callback(progress)
406 sql.commit()
407 if createIndex and not append:
408 sql.crt_index(n)
409 sql.close_database()
410