1
2
3
4 """The classes used for data mining.
5
Currently there are two data miners:
- Corpus miner: mines from text files.
- Facebook miner: mines from a Facebook profile. In fact, this miner writes
the profile's posts into a file and then the file is mined the same way as
a text corpus. That's why the two miner classes subclass the TextMiner
class.
12
13 @todo 0.1.0:
14 Implement the twitter miner which should be very similar to the facebook
15 miner.
16 """
17
18 try:
19 import configparser
20 except ImportError:
21 import ConfigParser as configparser
22 import tknz
23 import db
24 import os
25 import abc
26 import urllib.request as urllib2
27 import json
28 import re
29 import requests
30 import time
31 import datetime
32 from lg import lg
33
34
35
36 FB_CREATION_TIME = 1075852860
37
38
39
40
41
42
43 -class GraphAPI(object):
44 """A client for the Facebook Graph API.
45 See http://developers.facebook.com/docs/api for complete
46 documentation for the API.
47 The Graph API is made up of the objects in Facebook (e.g., people,
48 pages, events, photos) and the connections between them (e.g.,
49 friends, photo tags, and event RSVPs). This client provides access
50 to those primitive types in a generic way. For example, given an
51 OAuth access token, this will fetch the profile of the active user
52 and the list of the user's friends:
53 graph = facebook.GraphAPI(access_token)
54 user = graph.get_object("me")
55 friends = graph.get_connections(user["id"], "friends")
56 You can see a list of all of the objects and connections supported
57 by the API at http://developers.facebook.com/docs/reference/api/.
58 You can obtain an access token via OAuth or by using the Facebook
59 JavaScript SDK. See
60 http://developers.facebook.com/docs/authentication/ for details.
61 If you are using the JavaScript SDK, you can use the
62 get_user_from_cookie() method below to get the OAuth access token
63 for the active user from the cookie saved by the SDK.
64 """
65
def __init__(self, access_token=None, timeout=None, version=None):
    """Create a Graph API client.

    @param access_token:
        The OAuth access token stored on the client. May be None.
    @type access_token: str
    @param timeout:
        The timeout stored on the client for later HTTP requests.
    @param version:
        The Graph API version to target, in "#.#" form (e.g. "2.0"). It is
        converted with str() before validation. A falsy value selects the
        default version ("2.0").

    @raise GraphAPIError:
        If the version is not in "#.#" form or is not one of the supported
        versions.
    """
    default_version = "2.0"
    valid_API_versions = ["2.0", "2.1", "2.2", "2.3"]

    self.access_token = access_token
    self.timeout = timeout

    if not version:
        self.version = "v" + default_version
        return

    version = str(version)
    # Raw string: "\d" and "\." in a plain literal are invalid escape
    # sequences and trigger a warning on recent Python versions.
    if re.search(r"^\d\.\d$", version) is None:
        raise GraphAPIError("Version number should be in the"
                            " following format: #.# (e.g. 2.0).")
    if version not in valid_API_versions:
        raise GraphAPIError("Valid API versions are " +
                            str(valid_API_versions).strip('[]'))
    self.version = "v" + version
88
90 """Fetchs the given object from the graph."""
91 return self.request(self.version + "/" + id, args)
92
94 """Fetchs all of the given object from the graph.
95 We return a map from ID to object. If any of the IDs are
96 invalid, we raise an exception.
97 """
98 args["ids"] = ",".join(ids)
99 return self.request(self.version + "/", args)
100
102 """Fetchs the connections for given object."""
103 return self.request(
104 self.version + "/" + id + "/" + connection_name, args)
105
def put_object(self, parent_object, connection_name, **data):
    """Write the given object to the graph, connected to the given parent.

    For example::

        graph.put_object("me", "feed", message="Hello, world")

    writes "Hello, world" to the active user's wall. Likewise, this
    will comment on the first post of the active user's feed::

        feed = graph.get_connections("me", "feed")
        post = feed["data"][0]
        graph.put_object(post["id"], "comments", message="First!")

    See http://developers.facebook.com/docs/api#publishing for all
    of the supported writeable objects.
    Certain write operations require extended permissions. For
    example, publishing to a user's feed requires the
    "publish_actions" permission. See
    http://developers.facebook.com/docs/publishing/ for details
    about publishing permissions.

    @param parent_object:
        The ID of the object the new object is attached to.
    @type parent_object: str
    @param connection_name:
        The name of the connection (e.g. "feed", "comments").
    @type connection_name: str
    @param data:
        The fields of the object, sent as POST arguments.

    @return:
        Whatever self.request() returns for the POST.

    @raise AssertionError:
        If the client has no access token.
    """
    # Raised explicitly rather than with an ``assert`` statement so the
    # check still runs when Python is started with -O; the exception type
    # is unchanged for existing callers.
    if not self.access_token:
        raise AssertionError("Write operations require an access token")
    return self.request(
        self.version + "/" + parent_object + "/" + connection_name,
        post_args=data,
        method="POST")
128
def put_wall_post(self, message, attachment=None, profile_id="me"):
    """Write a wall post to the given profile's wall.

    We default to writing to the authenticated user's wall if no
    profile_id is specified.

    @param message:
        The text of the status message.
    @type message: str
    @param attachment:
        An optional structured attachment added to the status message
        being posted to the wall. It should be a dictionary of the form::

            {"name": "Link name",
             "link": "http://www.example.com/",
             "caption": "{*actor*} posted a new review",
             "description": "This is a longer description of the attachment",
             "picture": "http://www.example.com/thumbnail.jpg"}

        Its entries are forwarded as extra fields of the post.
    @type attachment: dict
    @param profile_id:
        The ID of the profile whose feed is written to.
    @type profile_id: str

    @return:
        Whatever self.put_object() returns.
    """
    # None instead of a shared mutable ``{}`` default.
    if attachment is None:
        attachment = {}
    return self.put_object(profile_id, "feed", message=message,
                           **attachment)
143
147
149 """Likes the given post."""
150 return self.put_object(object_id, "likes")
151
153 """Deletes the object with the given ID from the graph."""
154 self.request(self.version + "/" + id, method="DELETE")
155
157 """Deletes the Request with the given ID for the given user."""
158 self.request("%s_%s" % (request_id, user_id), method="DELETE")
159
def put_photo(self, image, album_path="me/photos", **kwargs):
    """Upload an image with a POST request carrying a file part.

    @param image:
        A file object representing the image to be uploaded. It is sent
        under the "source" file field.
    @param album_path:
        A path representing where the image should be uploaded.
    @type album_path: str
    @param kwargs:
        Extra fields sent as POST arguments along with the image.

    @return:
        Whatever self.request() returns for the upload.
    """
    destination = self.version + "/" + album_path
    upload = {"source": image}
    return self.request(
        destination, post_args=kwargs, files=upload, method="POST")
171
173 """Fetches the current version number of the Graph API being used."""
174 args = {"access_token": self.access_token}
175 try:
176 response = requests.request("GET",
177 "https://graph.facebook.com/" +
178 self.version + "/me",
179 params=args,
180 timeout=self.timeout)
181 except requests.HTTPError as e:
182 response = json.loads(e.read())
183 raise GraphAPIError(response)
184
185 try:
186 headers = response.headers
187 version = headers["facebook-api-version"].replace("v", "")
188 return float(version)
189 except Exception:
190 raise GraphAPIError("API version number not available")
191
def request(
        self, path, args=None, post_args=None, files=None, method=None):
    """Fetch the given path in the Graph API.

    We translate args to a valid query string. If post_args is
    given, we send a POST request to the given path with the given
    arguments.

    @note: When the client has an access token it is added to post_args
        (if given) or to args, so a dictionary passed by the caller is
        modified in place.

    @param path:
        The path appended to "https://graph.facebook.com/".
    @type path: str
    @param args:
        The query-string parameters.
    @type args: dict
    @param post_args:
        The form data. When not None the request method is forced to POST.
    @type post_args: dict
    @param files:
        The files forwarded to requests.request() for the upload.
    @param method:
        The HTTP method. Defaults to "GET".
    @type method: str

    @return:
        The decoded JSON body, or a dictionary describing an image
        ("data", "mime-type", "url"), or a dictionary holding the
        "access_token" (and "expires" when present) parsed from a
        query-string body.

    @raise GraphAPIError:
        If the HTTP library reports an HTTP error, if the response type is
        not recognised or if the decoded result carries an "error" entry.
    """
    # Imported here because the fallback branch below needs it and the
    # name is not otherwise brought into scope for this block.
    from urllib.parse import parse_qs

    args = args or {}

    if post_args is not None:
        method = "POST"

    if self.access_token:
        if post_args is not None:
            post_args["access_token"] = self.access_token
        else:
            args["access_token"] = self.access_token

    try:
        response = requests.request(method or "GET",
                                    "https://graph.facebook.com/" +
                                    path,
                                    timeout=self.timeout,
                                    params=args,
                                    data=post_args,
                                    files=files)
    except requests.HTTPError as e:
        # requests exceptions have no read() method; the server reply, if
        # there is one, is carried by the ``response`` attribute.
        try:
            payload = e.response.json()
        except (AttributeError, ValueError):
            payload = str(e)
        raise GraphAPIError(payload) from e

    # A response without a content-type header is treated like an
    # unrecognised type instead of failing with a bare KeyError.
    content_type = response.headers.get('content-type', '')
    if 'json' in content_type:
        result = response.json()
    elif 'image/' in content_type:
        result = {"data": response.content,
                  "mime-type": content_type,
                  "url": response.url}
    else:
        query_str = parse_qs(response.text)
        if "access_token" not in query_str:
            raise GraphAPIError(
                'Maintype was not text, image, or querystring')
        result = {"access_token": query_str["access_token"][0]}
        if "expires" in query_str:
            result["expires"] = query_str["expires"][0]

    if result and isinstance(result, dict) and result.get("error"):
        raise GraphAPIError(result)
    return result
244
def fql(self, query):
    """Run an FQL query through self.request().

    Example query: "SELECT affiliations FROM user WHERE uid = me()"

    @param query:
        The FQL query, sent as the "q" argument.
    @type query: str

    @return:
        Whatever self.request() returns for the query.
    """
    endpoint = self.version + "/" + "fql"
    arguments = {"q": query}
    return self.request(endpoint, arguments)
250
252 """Get the application's access token as a string."""
253 args = {'grant_type': 'client_credentials',
254 'client_id': app_id,
255 'client_secret': app_secret}
256
257 return self.request("oauth/access_token", args=args)["access_token"]
258
261 """Get an access token from the "code" returned from an OAuth dialog.
262 Returns a dict containing the user-specific access token and its
263 expiration date (if applicable).
264 """
265 args = {
266 "code": code,
267 "redirect_uri": redirect_uri,
268 "client_id": app_id,
269 "client_secret": app_secret}
270
271 return self.request("oauth/access_token", args)
272
274 """
275 Extends the expiration time of a valid OAuth access token. See
276 <https://developers.facebook.com/roadmap/offline-access-removal/
277 #extend_token>
278 """
279 args = {
280 "client_id": app_id,
281 "client_secret": app_secret,
282 "grant_type": "fb_exchange_token",
283 "fb_exchange_token": self.access_token,
284 }
285
286 return self.request("oauth/access_token", args=args)
287
290 """List every "miner" classes to be used.
291
292 MinerRegistry gather every miners to be used according to the configuration
293 file. It provide methods to mine corpuses from different sources, using
294 different processing strategies and store the results in different outputs.
295
296 G{classtree MinerRegistry}
297 """
298
300 """ Constructor of the MinerRegistry class.
301
302 @param config:
303 The configuration file. It is used to retrieve the miners classes
304 names that will be added to the MinerRegistry.
305 @type config: L{drvr.Configuration}
306 """
307 self.config = config
308 self.set_miners()
309
311 """Add miner class(es) to the list according to the configuration.
312
313 Retrieve the miners classes names from the configuration and try to
314 add them to the list.
315 """
316 self[:] = []
317 for miner in self.config.getas('MinerRegistry', 'miners', 'list'):
318 self.add_miner(miner)
319
321 """Try to add a miner class to the list using its name.
322
323 Get the miner class name from the configuration and create an instance
324 of this class if it exists, then, add the instance to the list.
325
326 @param minerName:
327 The name of the miner. It must correspond to a section of the
328 configuration so that its miner class can be retrieved.
329 @type minerName: str
330 """
331 try:
332 minerClass = self.config[minerName]['class']
333 except KeyError:
334 print("Config file is incorrect. Check the miner key of the"
335 " %s section if exists or create it." % (minerName))
336 if minerClass == 'CorpusMiner':
337 miner = CorpusMiner(self.config, minerName)
338 elif minerClass == 'FacebookMiner':
339 miner = CorpusMiner(self.config, minerName)
340 else:
341 print('WARNING: miner class "%s" is unknown. miner won\'t be'
342 'added to the registry.' % minerName)
343 miner = None
344 if miner:
345 self.append(miner)
346
348 """Use the miner instances list to mine the sources.
349
350 Loop through every miner instances of the list and call their mine()
351 method to perform their mining operation.
352 """
353 for miner in self:
354 miner.mine()
355
357 """Close database of every miner instances using a database."""
358 for miner in self:
359 if callable(getattr(miner, "close_database", None)):
360 miner.close_database()
361
364 """Abstract class for all miners.
365
366 G{classtree Miner}
367 """
368
369 __metaclass__ = abc.ABCMeta
370
def __init__(self, config=None, minerName='None', callback=None):
    """Store the settings shared by the miners.

    @param config:
        The configuration object. Its getas() method is called to read
        the 'dbfilename' option of the miner's section, so it must not be
        None in practice.
    @param minerName:
        The name of the miner, used as the configuration section name.
    @type minerName: str
    @param callback:
        The callback stored on the instance.
    """
    self.name = minerName
    self.callback = callback
    self.config = config
    # Path of the database file, read from the miner's own section.
    self.dbFile = config.getas(minerName, 'dbfilename')
376
378 """Remove the database file (call os.system)."""
379 os.system("rm %s" % (self.dbFile))
380
381 @abc.abstractmethod
383 raise NotImplementedError("Method must be implemented")
384
385
class TextMiner(Miner):
    """The miner for text files.

    This miner mines text files by extracting valid n-grams from them and
    inserting them in databases. Mining a text requires:
        - Tokenizing the text.
        - Extracting n-grams.
        - Inserting n-grams in a database in a special way.

    @see: L{tknz.TextTokenizer}, L{db.insert_ngrams}

    G{classtree TextMiner}
    """

    def __init__(self, config, minerName, callback=None):
        """Constructor of the TextMiner class.

        @param config:
            The configuration file. It is used to retrieve the miner
            parameters ('lowercase' and 'n' options of the miner section).
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            The callback is used to show the progress percentage. In the gui
            a callback method is implemented to update a progress bar
            showing the n-grams insertion progress (cf. gui.py).
        @type callback: fun(float, ...)
        """
        super().__init__(config, minerName, callback)
        self.lowercase = self.config.getas(self.name, 'lowercase')
        self.n = self.config.getas(self.name, 'n', 'int')

    def _notify(self, *args):
        """Forward a progress report to the callback, if one was given."""
        if self.callback:
            self.callback(*args)

    def update_db(self, textPath):
        """Mine a text file, updating the database.

        The n-grams of every size from 1 to self.n are extracted and added
        to the database in append mode.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        """
        for size in range(1, self.n + 1):
            self.add_to_db(self.crt_ngram_map(textPath, size), size, True)

    def crt_new_db(self, textPath):
        """Mine a text file.

        This method doesn't try to update the n-grams counts so it will fail
        if it tries to add an n-gram which is already in the database but
        this method is a little faster than update_db().

        @note: If you're intending to create a new database but it already
            exists please consider calling rm_db() first.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        """
        for size in range(1, self.n + 1):
            self.add_to_db(self.crt_ngram_map(textPath, size), size, False)

    def crt_ngram_map(self, textPath, n):
        """Create a n-gram dictionary from a file.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        @param n:
            The n in n-gram, forwarded to the tokenizer.
        @type n: int

        @return:
            The n-gram dictionary returned by the tokenizer.
        @rtype: dict
        """
        lg.info("Parsing " + str(n) + "-grams from " + textPath)
        # The callback is optional (it defaults to None), so report through
        # the guarded helper instead of calling it directly.
        self._notify(0, 'parsing ' + str(n) + '-grams from ' + textPath)
        # NOTE(review): the callback is still handed to the tokenizer as-is,
        # possibly None -- confirm tknz.TextTokenizer tolerates that.
        self.tokenizer = tknz.TextTokenizer(
            textPath, n, self.lowercase, 0, self.callback)
        ngramMap = self.tokenizer.tknize_text()
        lg.info(
            str(len(ngramMap)) + ' ngrams have been extracted from ' + textPath)
        return ngramMap

    def add_to_db(self, ngramMap, n, append=False):
        """Add n-grams of an n-gram dictionary to the database.

        @param ngramMap:
            The n-gram dictionary returned by
            tknz.TextTokenizer.tknize_text().
        @type ngramMap: dict
        @param n:
            The n in n-gram, forwarded to db.insert_ngrams().
        @type n: int
        @param append:
            Indicate whether the n-grams should be appended to the database.
        @type append: bool
        """
        lg.info("Writing result to " + self.dbFile)
        self._notify(0, 'writing ' + str(n) + '-grams to ' + self.dbFile)
        # NOTE(review): as above, db.insert_ngrams receives the raw callback
        # (possibly None) -- confirm it handles a missing callback.
        db.insert_ngrams(ngramMap, n, self.dbFile, append, True, self.callback)
        lg.info('n-grams successfully added to the database')
489
492 """The miner for text corpus.
493
494 This miner is basically a L{minr.TextMiner} wrapper that implement the
495 mine() method which merely loops on every files of the corpus and call the
496 L{minr.TextMiner.update_db} method to effectively do the mining operation.
497
498 G{classtree CorpusMiner}
499 """
500
def __init__(self, config, minerName, callback=None):
    """Constructor of the CorpusMiner class.

    @param config:
        The configuration file. The 'texts' option of the miner section is
        read from it as a list and kept as the corpus file list.
    @type config: L{drvr.Configuration}
    @param minerName:
        The name of the miner.
    @type minerName: str
    @param callback:
        The callback used to report progress; it is forwarded to the parent
        constructor and also stored on the instance.
    @type callback: fun(float, ...)
    """
    super().__init__(config, minerName, callback)
    self.callback = callback
    texts = self.config.getas(self.name, 'texts', 'list')
    self.corpusFiles = texts
519
521 """Perform the mining operation."""
522 for text in self.corpusFiles:
523 self.update_db(text)
524 self.callback(100, 'Done')
525
528 """A miner to mine dictionary-like files.
529
530 This miner isn't a real miner as it only extract words from a
531 dictionary-like file and insert them into a database.
532 A dictionnary-like file is a file listing words, one word per line::
533 about
534 army
535 bath
536 boat
537 ...
538
539 G{classtree DictMiner}
540 """
541
def __init__(self, config, minerName, callback=None):
    """Constructor of the DictMiner class.

    @param config:
        The configuration file. The 'dictionary' and 'dbfilename' options
        of the miner section are read from it.
    @type config: L{drvr.Configuration}
    @param minerName:
        The name of the miner.
    @type minerName: str
    @param callback:
        The callback is used to show the progress percentage. In the gui a
        callback method is implemented to update a progress bar showing the
        n-grams insertion progress (cf. gui.py).
    @type callback: fun(float, ...)
    """
    super().__init__(config, minerName, callback)
    # The section name is self.name; the former "self.nale" was a typo that
    # raised AttributeError on every construction.
    self.dictFile = self.config.getas(self.name, 'dictionary')
    self.database = self.config.getas(self.name, 'dbfilename')
560
562 """Perform the mining operation.
563
564 @note: This method could have used the update_db() method like the C
565 orpusMiner and FbMiner do but this method avoid useless operations
566 and is, therefore, faster.
567
568 @todo 0.0.2:
569 Make sure every lines of the file contain one single word (or none).
570 """
571 progress = 0
572 sql = db.SqliteDatabaseConnector(self.database)
573 sql.crt_ngram_table()
574 with open(self.dictFile) as dictFile:
575 for i, l in enumerate(dictFile):
576 pass
577 noLines = i + 1
578 with open(self.dictFile) as dictFile:
579 for word in dictFile:
580 ngram = [word.strip('\n').lower()]
581 oldCount = sql.ngram_count(ngram)
582 if oldCount > 0:
583 sql.update_ngram(ngram, oldCount + 1)
584 else:
585 sql.insert_ngram(ngram, 1)
586 progress += 100 / noLines
587 if self.callback:
588 self.callback(progress)
589 sql.commit()
590 sql.crt_index(1)
591 sql.close_database()
592
615
618 """The Facebook user profile miner.
619
620 This miner use an access token to access a user facebook wall and retrieve
621 its text message. While the messages's text is retrieve, the miner write
622 them into a txt file. Once every messages have been write to the file the
623 miner generate n-grams from the file and insert them in the database, using
624 the L{minr.TextMiner} methods.
625
626 @note: The miner do not retrieve every facebook wall messages each time.
627 When mining a facebook wall he saves the published date of the latest
628 message and on next mining it will only retrieved the messages that have
629 been published AFTER the saved date.
630 See: L{minr.FacebookMiner.update_fb}
631
632 @todo 0.2.0:
633 Create web app in order to log the user to facebook and twitter,
634 authenticate them and ask for permissions (if needed) and finaly get
635 access token.
636
637 G{classtree FacebookMiner}
638 """
639
def __init__(self, config, minerName, callback=None):
    """Constructor of the FacebookMiner class.

    @param config:
        The configuration file. The 'accesstoken' option and the
        'last_update' option (as an int) of the miner section are read
        from it.
    @type config: L{drvr.Configuration}
    @param minerName:
        The name of the miner.
    @type minerName: str
    @param callback:
        The callback used to report progress; it is forwarded to the parent
        constructor and also stored on the instance.
    @type callback: fun(float, ...)
    """
    super().__init__(config, minerName, callback)
    self.callback = callback
    # Name of the text file the posts' messages are written to.
    self.fbFile = 'fb.txt'
    section = self.name
    self.accessToken = self.config.getas(section, 'accesstoken')
    self.previousLast = self.config.getas(section, 'last_update', 'int')
660
662 """Perform the mining operation."""
663 if not self.accessToken is None:
664 try:
665 jsonText = self.get_user_details()
666 jsonText = json.loads(jsonText)
667 userInfo = json.loads(jsonText)
668 except TypeError:
669 self.callback(0, 'error: Faebook access token is invalid')
670 lg.error('Facebook access token is invalid')
671 return
672 else:
673 self.callback(0, 'error: Faebook access token is missing')
674 lg.error('Facebook access token is missing')
675 return
676 self.callback(0, 'Mining facebook user posts')
677 PAGE_ID = userInfo['id']
678 graph = GraphAPI(self.accessToken)
679 profile = graph.get_object(PAGE_ID)
680 posts = graph.get_connections(profile['id'], 'posts')
681 self.update_fb(posts)
682 self.update_db(self.fbFile)
683 self.callback(100, 'Done')
684
686 """Write a facebook post message to a file.
687
688 @param post:
689 A facebook post is a dictionary. If the post contains a textual
690 message then it is associated to the 'message' key.
691 @type post: dict
692 @param fo:
693 The file to write in.
694 @type fo: TextIOWrapper
695 """
696 if "message" in post:
697 fo.write(post['message'].encode('utf-8') + "\n".encode('ascii'))
698
700 """Fully mine every posts of a facebook profile.
701
702 Mine the posts contained in the "posts" parameter (which should contains
703 the latest posts) and keep requesting older posts until we reach the
704 last post. If a post contains a textual message it is automatically
705 written in the file at self.fbFile.
706 The latest post unix time is computed and written in the config.
707
708 @note: It is not possible to get every posts of a facebook profile in a
709 single request (except if the profile contains very few posts).So
710 The method must scan the "posts" dictionary and request the older
711 posts until the request return a posts dictionary.
712
713 @param posts:
714 "posts" is returned by GraphAPI.get_connections(), it contains
715 posts of a facebook profile.
716 @type posts: dict
717 """
718 fo = open(self.fbFile, "wb")
719 lastPostTime = ''
720 while True:
721 try:
722 if lastPostTime == '':
723 lastPostTime = time.mktime(datetime.datetime.strptime(
724 posts['data'][0]['created_time'],
725 "%Y-%m-%dT%H:%M:%S+0000").timetuple())
726 for post in posts['data']:
727 self.write_to_file(post, fo)
728 posts = requests.get(posts['paging']['next']).json()
729 except KeyError:
730 break
731 fo.close()
732 if lastPostTime:
733 self.config[self.name]['LAST_UPDATE'] = \
734 str(int(float(lastPostTime)))
735
737 """Mine posts of a facebook profile since the last mining operation.
738
739 Mine the posts contained in the "posts" parameter (which should contains
740 the latest posts) and keep requesting older posts until:
741 - We reach a post that has already be mined (comparison is carried
742 out using unix time and the 'last_update' config option)
743 - We reach the last post of the facebook profile.
744 If a post contains a textual message it is automatically written in
745 the file at self.fbFile.
746 The latest post unix time is computed and written in the config so that
747 we know which posts have been published after this one the next time
748 the method is called.
749
750 @note: It is not possible to get every posts of a facebook profile in a
751 single request (except if the profile contains very few posts).
752 So The method must scan the "posts" dictionary and request the
753 older posts until the request return a posts dictionary.
754
755 @param posts:
756 "posts" is returned by GraphAPI.get_connections(), it contains posts
757 of a facebook profile.
758 @type posts: dict
759 """
760 fo = open(self.fbFile, "wb")
761 lastPostTime = ''
762 stop = False
763 if self.previousLast:
764 previousLast = self.previousLast
765 else:
766 previousLast = FB_CREATION_TIME
767 while True:
768 try:
769 if lastPostTime == '':
770 lastPostTime = time.mktime(datetime.datetime.strptime(
771 posts['data'][0]['created_time'],
772 "%Y-%m-%dT%H:%M:%S+0000").timetuple())
773 diff = lastPostTime - previousLast
774 else:
775 try:
776
777 self.callback(float(100 * previousLast / time.mktime(
778 datetime.datetime.strptime(
779 posts['data'][0]['created_time'],
780 "%Y-%m-%dT%H:%M:%S+0000").timetuple())))
781 except IndexError:
782 pass
783 for post in posts['data']:
784 postTime = time.mktime(datetime.datetime.strptime(
785 post['created_time'],
786 "%Y-%m-%dT%H:%M:%S+0000").timetuple())
787 if postTime > previousLast:
788 self.write_to_file(post, fo)
789 else:
790 stop = True
791 break
792 if stop:
793 break
794 posts = requests.get(posts['paging']['next']).json()
795 except KeyError:
796 break
797 fo.close()
798 if lastPostTime:
799 self.config[self.name]['last_update'] = \
800 str(int(float(lastPostTime)))
801
803 """Use the facebook access token to get details about the user.
804
805 @return:
806 The user details or an empty dictionary if the request fail wich
807 probably means that the access token is invalid or outdated.
808 @rtype: dict
809 """
810 jDict = {}
811 url = "https://graph.facebook.com/me?access_token=" + self.accessToken
812 try:
813 response = urllib2.urlopen(urllib2.Request(url))
814 jDict = json.dumps(response.read().decode('utf-8'))
815 except Exception:
816 pass
817 return jDict
818
820 """Override the parent method.
821
822 This method delete the database file and also set the last_update option
823 of the facebook miner to the oldest value possible so that the facebook
824 account will be fully scraped on next mining operation.
825 """
826 os.system("rm %s" % (self.dbFile))
827 self.config['FbMiner']['last_update'] = str(FB_CREATION_TIME)
828