1
2
3
4 """Classes to connect to databases."""
5
6 from sqlite3 import connect, OperationalError
7 from re import compile
8 from tipy.lg import lg
9
10
14
16 return repr('Cannot open file "%s".' % self.value)
17
18
20 """Implement methods for accessing the databases.
21
22 G{classtree DatabaseConnector}
23 """
24
25 singleQuoteRegex = compile("'")
26
28 """DababaseConnector creator.
29
30 @param dbname:
31 Path to the database file.
32 @type dbname: str
33 @param maxN:
34 The n in the longer database n-grams table.
35 @type maxN: int
36 """
37 self.maxN = maxN
38 self.dbname = dbname
39
41 """Creates a table in the database to store n-gram of a given n.
42
43 @param n:
44 The n in n-gram. A table called [n]_gram (where [n] is the n
45 parameter) will be created.
46 @type n: int
47 """
48 query = 'CREATE TABLE IF NOT EXISTS _{0}_gram ('.format(n)
49 unique = ''
50 for i in reversed(range(n)):
51 if i != 0:
52 unique += 'word_{0}, '.format(i)
53 query += 'word_{0} TEXT, '.format(i)
54 else:
55 unique += 'word'
56 query += 'word TEXT, count INTEGER, UNIQUE({0}) );'.format(
57 unique)
58 self.execute_sql(query)
59
61 """Drop a n-gram table in the database.
62
63 @param n:
64 The n in n-gram.
65 @type n: int
66 """
67 query = 'DROP TABLE IF EXISTS _{0}_gram;'.format(n)
68 self.execute_sql(query)
69
71 """Create the index for the table storing n-gram of given n.
72
73 @param n:
74 The n in n-gram.
75 @type n: int
76 """
77 for i in reversed(range(n)):
78 if i != 0:
79 query = 'CREATE INDEX IF NOT EXISTS idx_{0}_gram_{1} ON '\
80 '_{0}_gram(word_{1});'.format(n, i)
81 self.execute_sql(query)
82
84 """Drop the index for the table storing n-gram of given n.
85
86 @param n:
87 The n in n-gram.
88 @type n: int
89 """
90 for i in reversed(range(n)):
91 if i != 0:
92 query = 'DROP INDEX IF EXISTS idx_{0}_gram_{1};'.format(
93 n, i)
94 self.execute_sql(query)
95
97 """Create the database tables and indexes if they don't exists.
98
99 This is usefull to avoid crash when trying to hit an unexisting table.
100 """
101 for n in range(1, int(self.maxN) + 1):
102 self.crt_ngram_table(n)
103 self.crt_index(n)
104
105 - def ngrams(self, withCounts=False):
106 """Returns all ngrams that are in every tables of the database.
107
108 @note: The result is convert to tuple before being returned because it
109 is used as a dictionary key and only immutable types can be used
110 as dictionary key, so list cannot.
111
112 @param withCounts:
113 Indicate if the ngrams counts (number of occurences) should be
114 returned too.
115 @type withCounts: bool
116
117 @return:
118 The n-grams of each tables of the database.
119 @rtype: tuple
120 """
121 query = 'SELECT '
122 for i in reversed(range(self.maxN)):
123 if i != 0:
124 query += 'word_{0}, '.format(i)
125 elif i == 0:
126 query += 'word'
127 if withCounts:
128 query += ', count'
129 query += ' FROM _{0}_gram;'.format(self.maxN)
130 result = self.execute_sql(query)
131 for row in result:
132 yield tuple(row)
133
135 """Compute the occurences sum of every n-grams of given n in database.
136
137 @param n:
138 The n in n-gram.
139 @type n: int
140
141 @return:
142 The sum of the number of occurences of every n-grams in the n-grams
143 table of given n in the database.
144 @rtype: int
145 """
146 query = 'SELECT SUM(count) from _' + str(n) + '_gram;'
147 return self.extract_first_integer(self.execute_sql(query))
148
150 """Compute the number of n-grams in the n-grams table of given n.
151
152 @param n:
153 The n in n-gram.
154 @type n: int
155
156 @return:
157 The number of n-grams in the n-grams table of given n in the
158 database.
159 @rtype: int
160 """
161 table = '_' + str(n) + '_gram'
162 query = 'SELECT Count() from _' + str(n) + '_gram;'
163 return self.extract_first_integer(self.execute_sql(query))
164
166 """Retrieve the number of occurences of a given ngram in the database.
167
168 @param ngram:
169 The n-gram for which number of occurences must be retrieved.
170 @type ngram: list
171
172 @return:
173 The number of occurences of the n-gram or 0 if the n-gram is not
174 in the databade.
175 @rtype: int
176 """
177 query = 'SELECT count FROM _{0}_gram'.format(len(ngram))
178 query += self.make_where_clause(ngram) + ';'
179 return self.extract_first_integer(self.execute_sql(query))
180
182 """Retrieve the n-gram records which complete the given n-gram.
183
184 For instance, if ngram is::
185 ['on', 'the', 'ta']
186 Then the returned records would be somthing like::
187 [['on', 'the', 'table' ], 5]
188 [['on', 'the', 'take' ], 1]
189 [['on', 'the', 'taskbar'], 1]
190
191 @note: the query makes sure the n-grams are returned in descending order
192 according to their number of occurences. This is important
193 because the predictors predict() methods which would call this
194 method can limit their suggestion number so the most probable
195 suggestions (which are based on most frequent n-grams) must be
196 append to the list first.
197
198 @param ngram:
199 The n-gram words of the record to retrieve in the database.
200 @type ngram: list
201 @param limit:
202 Maximum number of records to retrieve.
203 @type limit: int
204
205 @return:
206 Return the n-grams records (n-gram words + number of occurences)
207 completing the n-gram in the database or an empty list if no n-grams
208 have been found.
209 @rtype: list
210 """
211 query = '{0} FROM _{1}_gram {2} ORDER BY count DESC'.format(
212 self.make_select_like_clause(len(ngram)), len(ngram),
213 self.make_where_like_clause(ngram))
214 if limit < 0:
215 query += ';'
216 else:
217 query += ' LIMIT ' + str(limit) + ';'
218 return self.execute_sql(query)
219
221 """Insert an n-gram with its number of occurences into the database.
222
223 @param ngram:
224 The n-gram to insert.
225 @type ngram: list
226 @param count:
227 The number of occurences of the n-gram to insert. It is usually 1 as
228 the n-grams should be inserted in the database only if they are not
229 already in it and they are added when they are read on the input
230 buffer.
231 @type count: int
232 """
233 query = 'INSERT INTO _{0}_gram {1};'.format(len(ngram),
234 self.make_values_clause(ngram, count))
235 self.execute_sql(query)
236
238 """Update an n-gram number of occurences in the database.
239
240 @warning: The ngram has to be in the database, otherwise this method
241 will fail.
242
243 @param ngram:
244 The n-gram to update (!!! should be in the database !!!).
245 @type ngram: list
246 @param count:
247 The number of occurences of the n-gram.
248 @type count: int
249 """
250 query = 'UPDATE _{0}_gram SET count = {1}'.format(len(ngram), count)
251 query += self.make_where_clause(ngram) + ';'
252 self.execute_sql(query)
253
255 """Remove a given ngram from the databae.
256
257 @warning: The ngram has to be in the database, otherwise this method
258 will fail.
259
260 @param ngram:
261 The n-gram to delete (!!! should be in the database !!!).
262 @type ngram: list
263 """
264 query = 'DELETE FROM _{0}_gram'.format(len(ngram))
265 query += self.make_where_clause(ngram) + ';'
266 self.execute_sql(query)
267
269 ng = [self.singleQuoteRegex.sub("''", n) for n in ngram]
270 valuesClause = "VALUES('" + "', '".join(ng) + "', {0})".format(count)
271 return valuesClause
272
274 whereClause = " WHERE"
275 for i in range(len(ngram)):
276 n = self.singleQuoteRegex.sub("''", ngram[i])
277 if i < (len(ngram) - 1):
278 whereClause += " word_{0} = '{1}' AND".format(
279 len(ngram) - i - 1, n)
280 else:
281 whereClause += " word = '{0}'".format(n)
282 return whereClause
283
285 result = "SELECT "
286 for i in reversed(range(1, n)):
287 result += "word_{0}, ". format(i)
288 result += "word, count"
289 return result
290
292 whereClause = " WHERE"
293 for i in range(len(ngram)):
294 if i < (len(ngram) - 1):
295 whereClause += " word_{0} = '{1}' AND".format(
296 len(ngram) - i - 1, ngram[i])
297 else:
298 whereClause += " word LIKE '{0}%'".format(ngram[-1])
299 return whereClause
300
302 count = 0
303 if table and table[0] and table[0][0]:
304 count = int(table[0][0])
305 if not count > 0:
306 count = 0
307 return count
308
309
311 """Database connector for sqlite databases.
312
313 G{classtree SqliteDatabaseConnector}
314 """
315
317 """SqliteDatabaseConnector creator.
318
319 @param dbname:
320 Path to the database file.
321 @type dbname: str
322 @param maxN:
323 The n in the longer database n-grams table.
324 @type maxN: int
325 """
326 DatabaseConnector.__init__(self, dbname, maxN)
327 self.con = None
328 self.open_database()
329 self.create_table_if_not_exists()
330
332 """Send a commit to the database."""
333 self.con.commit()
334
336 """Open the database."""
337 try:
338 self.con = connect(self.dbname)
339 except OperationalError:
340 lg.error("Cannot open database file '%s'" % (self.dbname))
341 raise OpenFileError(self.dbname)
342
344 """Close the database."""
345 if self.con:
346 self.con.close()
347
349 """Execute a query string on an open database.
350
351 @param query:
352 The query to execute.
353 @type query: str
354
355 @return:
356 The query result.
357 @rtype: list
358 """
359 c = self.con.cursor()
360 c.execute(query)
361 result = c.fetchall()
362 return result
363
364
365 -def insert_ngrams(ngramMap, n, outfile, append=False, createIndex=False,
366 callback=None):
367 """Insert every n-grams of the map in the database.
368
369 This function open a database and insert or update every n-grams of the
370 given list in it.
371
372 @param ngramMap:
373 The list of n-grams to insert or update.
374 @type ngramMap: list
375 @param n:
376 The n in n-gram.
377 @type n: int
378 @param outfile:
379 Path to the database.
380 @type outfile: str
381 @param append:
382 Indicate weither the n-gram should be append to the database. If the
383 database isn't empty, this function could raise conflict when trying
384 to insert an n-gram which is already in the database. When such
385 conflicts happen the append value is important:
386 - True: conflicting records count values are updated.
387 - False: conflicting records are replaced by new records.
388 @type append: bool
389 @param createIndex:
390 Indicate weither the database table indexes should be created.
391 @type createIndex: bool
392 @param callback:
393 The callback is used to show the progress percentage. In the gui a
394 callback method is implemented to update a progress bar showing the
395 n-grams insertion progress (cf. py).
396 @type callback: fun(float, ...)
397 """
398 progress = 0
399 sql = SqliteDatabaseConnector(outfile, n)
400 sql.crt_ngram_table(n)
401 for ngram, count in ngramMap.items():
402 if append:
403 oldCount = sql.ngram_count(ngram)
404 if oldCount > 0:
405 sql.update_ngram(ngram, oldCount + count)
406 else:
407 sql.insert_ngram(ngram, count)
408 else:
409 sql.remove_ngram(ngram)
410 sql.insert_ngram(ngram, count)
411 progress += 100 / len(ngramMap)
412 if callback:
413 callback(progress)
414 sql.commit()
415 if createIndex and not append:
416 sql.crt_index(n)
417 sql.close_database()
418