1
2
3
4 """The classes used for data mining.
5
6 Currently there are two data miners:
7 - Corpus miner: mines from text files.
8 - Facebook miner: mines from a Facebook profile. In fact, this miner writes
9 the profile's posts into a file and then the file is mined the same way as
10 a text corpus. That's why the two miner classes subclass the TextMiner
11 class.
12
13 @todo 0.1.0:
14 Implement the twitter miner which should be very similar to the facebook
15 miner.
16 """
17
18 from tipy.tknz import TextTokenizer
19 from tipy.db import SqliteDatabaseConnector, insert_ngrams
20 from os import system
21 from abc import ABCMeta, abstractmethod
22 try:
23 from urllib.request import Request, urlopen
24 except ImportError:
25 from urllib2 import Request, urlopen
26 from json import loads, dumps
27 from re import compile
28 from requests import request, HTTPError, get
29 from time import mktime
30 from datetime import datetime
31 from tipy.lg import lg
32
33
# Lower bound for incremental Facebook mining: used in place of the
# 'last_update' option when that option is unset, and written back to the
# configuration when the Facebook miner's database is removed.
FB_CREATION_TIME = 1075852860
"""@var: Facebook creation date in unix time."""
36
37
38
39
40
41
42 -class GraphAPI(object):
43 """A client for the Facebook Graph API.
44 See http://developers.facebook.com/docs/api for complete
45 documentation for the API.
46 The Graph API is made up of the objects in Facebook (e.g., people,
47 pages, events, photos) and the connections between them (e.g.,
48 friends, photo tags, and event RSVPs). This client provides access
49 to those primitive types in a generic way. For example, given an
50 OAuth access token, this will fetch the profile of the active user
51 and the list of the user's friends:
52 graph = facebook.GraphAPI(access_token)
53 user = graph.get_object("me")
54 friends = graph.get_connections(user["id"], "friends")
55 You can see a list of all of the objects and connections supported
56 by the API at http://developers.facebook.com/docs/reference/api/.
57 You can obtain an access token via OAuth or by using the Facebook
58 JavaScript SDK. See
59 http://developers.facebook.com/docs/authentication/ for details.
60 If you are using the JavaScript SDK, you can use the
61 get_user_from_cookie() method below to get the OAuth access token
62 for the active user from the cookie saved by the SDK.
63 """
64
65 - def __init__(self, access_token=None, timeout=None, version=None):
66
67 default_version = "2.0"
68 valid_API_versions = ["2.0", "2.1", "2.2", "2.3"]
69
70 self.access_token = access_token
71 self.timeout = timeout
72
73 if version:
74 version_regex = compile("^\d\.\d$")
75 match = version_regex.search(str(version))
76 if match is not None:
77 if str(version) not in valid_API_versions:
78 raise GraphAPIError("Valid API versions are " +
79 str(valid_API_versions).strip('[]'))
80 else:
81 self.version = "v" + str(version)
82 else:
83 raise GraphAPIError("Version number should be in the"
84 " following format: #.# (e.g. 2.0).")
85 else:
86 self.version = "v" + default_version
87
89 """Fetchs the given object from the graph."""
90 return self.request(self.version + "/" + id, args)
91
93 """Fetchs all of the given object from the graph.
94 We return a map from ID to object. If any of the IDs are
95 invalid, we raise an exception.
96 """
97 args["ids"] = ",".join(ids)
98 return self.request(self.version + "/", args)
99
101 """Fetchs the connections for given object."""
102 return self.request(
103 self.version + "/" + id + "/" + connection_name, args)
104
105 - def put_object(self, parent_object, connection_name, **data):
106 """Writes the given object to the graph, connected to the given parent.
107 For example,
108 graph.put_object("me", "feed", message="Hello, world")
109 writes "Hello, world" to the active user's wall. Likewise, this
110 will comment on a the first post of the active user's feed:
111 feed = graph.get_connections("me", "feed")
112 post = feed["data"][0]
113 graph.put_object(post["id"], "comments", message="First!")
114 See http://developers.facebook.com/docs/api#publishing for all
115 of the supported writeable objects.
116 Certain write operations require extended permissions. For
117 example, publishing to a user's feed requires the
118 "publish_actions" permission. See
119 http://developers.facebook.com/docs/publishing/ for details
120 about publishing permissions.
121 """
122 assert self.access_token, "Write operations require an access token"
123 return self.request(
124 self.version + "/" + parent_object + "/" + connection_name,
125 post_args=data,
126 method="POST")
127
128 - def put_wall_post(self, message, attachment={}, profile_id="me"):
129 """Writes a wall post to the given profile's wall.
130 We default to writing to the authenticated user's wall if no
131 profile_id is specified.
132 attachment adds a structured attachment to the status message
133 being posted to the Wall. It should be a dictionary of the form:
134 {"name": "Link name"
135 "link": "http://www.example.com/",
136 "caption": "{*actor*} posted a new review",
137 "description": "This is a longer description of the attachment",
138 "picture": "http://www.example.com/thumbnail.jpg"}
139 """
140 return self.put_object(profile_id, "feed", message=message,
141 **attachment)
142
146
148 """Likes the given post."""
149 return self.put_object(object_id, "likes")
150
152 """Deletes the object with the given ID from the graph."""
153 self.request(self.version + "/" + id, method="DELETE")
154
156 """Deletes the Request with the given ID for the given user."""
157 self.request("%s_%s" % (request_id, user_id), method="DELETE")
158
159 - def put_photo(self, image, album_path="me/photos", **kwargs):
160 """
161 Upload an image using multipart/form-data.
162 image - A file object representing the image to be uploaded.
163 album_path - A path representing where the image should be uploaded.
164 """
165 return self.request(
166 self.version + "/" + album_path,
167 post_args=kwargs,
168 files={"source": image},
169 method="POST")
170
172 """Fetches the current version number of the Graph API being used."""
173 args = {"access_token": self.access_token}
174 try:
175 response = request("GET",
176 "https://graph.facebook.com/" +
177 self.version + "/me",
178 params=args,
179 timeout=self.timeout)
180 except HTTPError as e:
181 response = loads(e.read())
182 raise GraphAPIError(response)
183
184 try:
185 headers = response.headers
186 version = headers["facebook-api-version"].replace("v", "")
187 return float(version)
188 except Exception:
189 raise GraphAPIError("API version number not available")
190
def request(
        self, path, args=None, post_args=None, files=None, method=None):
    """Fetches the given path in the Graph API.

    We translate args to a valid query string. If post_args is
    given, we send a POST request to the given path with the given
    arguments. When the client has an access token it is added to
    post_args if given, otherwise to args.

    path - Path appended to "https://graph.facebook.com/".
    args - Query-string parameters (dict, optional).
    post_args - POST payload (dict, optional); forces method "POST".
    files - Files passed through to the HTTP library (optional).
    method - HTTP method; defaults to "GET".

    Returns the decoded JSON for JSON responses, a dict with "data",
    "mime-type" and "url" keys for image responses, or a dict with
    "access_token" (and "expires" when present) for query-string
    responses.

    Raises GraphAPIError when the response is none of the above, or
    when the result is a dict with a truthy "error" entry.
    """
    # parse_qs is not part of the module-level imports, so bring it into
    # scope here (with the Python 2 location as a fallback, mirroring the
    # module's own urllib import style).
    try:
        from urllib.parse import parse_qs
    except ImportError:
        from urlparse import parse_qs

    # Work on copies so the caller's dictionaries are never modified
    # when the access token is added below.
    args = dict(args) if args else {}

    if post_args is not None:
        post_args = dict(post_args)
        method = "POST"

    if self.access_token:
        if post_args is not None:
            post_args["access_token"] = self.access_token
        else:
            args["access_token"] = self.access_token

    try:
        response = request(method or "GET",
                           "https://graph.facebook.com/" +
                           path,
                           timeout=self.timeout,
                           params=args,
                           data=post_args,
                           files=files)
    except HTTPError as e:
        # NOTE(review): this handler assumes the exception exposes a
        # read() method returning a JSON body - confirm that holds for
        # the HTTPError class imported by this module.
        response = loads(e.read())
        raise GraphAPIError(response)

    # A missing content-type header is treated as "unknown" and falls
    # through to the query-string branch instead of raising KeyError.
    content_type = response.headers.get('content-type', '')
    if 'json' in content_type:
        result = response.json()
    elif 'image/' in content_type:
        result = {"data": response.content,
                  "mime-type": content_type,
                  "url": response.url}
    else:
        # Last resort: a body of the form "access_token=...&expires=...".
        query_str = parse_qs(response.text)
        if "access_token" not in query_str:
            raise GraphAPIError(
                'Maintype was not text, image, or querystring')
        result = {"access_token": query_str["access_token"][0]}
        if "expires" in query_str:
            result["expires"] = query_str["expires"][0]

    if result and isinstance(result, dict) and result.get("error"):
        raise GraphAPIError(result)
    return result
243
244 - def fql(self, query):
245 """FQL query.
246 Example query: "SELECT affiliations FROM user WHERE uid = me()"
247 """
248 return self.request(self.version + "/" + "fql", {"q": query})
249
251 """Get the application's access token as a string."""
252 args = {'grant_type': 'client_credentials',
253 'client_id': app_id,
254 'client_secret': app_secret}
255
256 return self.request("oauth/access_token", args=args)["access_token"]
257
260 """Get an access token from the "code" returned from an OAuth dialog.
261 Returns a dict containing the user-specific access token and its
262 expiration date (if applicable).
263 """
264 args = {
265 "code": code,
266 "redirect_uri": redirect_uri,
267 "client_id": app_id,
268 "client_secret": app_secret}
269
270 return self.request("oauth/access_token", args)
271
273 """
274 Extends the expiration time of a valid OAuth access token. See
275 <https://developers.facebook.com/roadmap/offline-access-removal/
276 #extend_token>
277 """
278 args = {
279 "client_id": app_id,
280 "client_secret": app_secret,
281 "grant_type": "fb_exchange_token",
282 "fb_exchange_token": self.access_token,
283 }
284
285 return self.request("oauth/access_token", args=args)
286
289 """List every "miner" classes to be used.
290
291 MinerRegistry gather every miners to be used according to the configuration
292 file. It provide methods to mine corpuses from different sources, using
293 different processing strategies and store the results in different outputs.
294
295 G{classtree MinerRegistry}
296 """
297
299 """ Constructor of the MinerRegistry class.
300
301 @param config:
302 The configuration file. It is used to retrieve the miners classes
303 names that will be added to the MinerRegistry.
304 @type config: L{drvr.Configuration}
305 """
306 self.config = config
307 self.set_miners()
308
310 """Add miner class(es) to the list according to the configuration.
311
312 Retrieve the miners classes names from the configuration and try to
313 add them to the list.
314 """
315 self[:] = []
316 for miner in self.config.getas('MinerRegistry', 'miners', 'list'):
317 self.add_miner(miner)
318
320 """Try to add a miner class to the list using its name.
321
322 Get the miner class name from the configuration and create an instance
323 of this class if it exists, then, add the instance to the list.
324
325 @param minerName:
326 The name of the miner. It must correspond to a section of the
327 configuration so that its miner class can be retrieved.
328 @type minerName: str
329 """
330 try:
331 minerClass = self.config[minerName]['class']
332 except KeyError:
333 print("Config file is incorrect. Check the miner key of the"
334 " %s section if exists or create it." % (minerName))
335 if minerClass == 'CorpusMiner':
336 miner = CorpusMiner(self.config, minerName)
337 elif minerClass == 'FacebookMiner':
338 miner = CorpusMiner(self.config, minerName)
339 else:
340 print('WARNING: miner class "%s" is unknown. miner won\'t be'
341 'added to the registry.' % minerName)
342 miner = None
343 if miner:
344 self.append(miner)
345
347 """Use the miner instances list to mine the sources.
348
349 Loop through every miner instances of the list and call their mine()
350 method to perform their mining operation.
351 """
352 for miner in self:
353 miner.mine()
354
356 """Close database of every miner instances using a database."""
357 for miner in self:
358 if callable(getattr(miner, "close_database", None)):
359 miner.close_database()
360
363 """Abstract class for all miners.
364
365 G{classtree Miner}
366 """
367
368 __metaclass__ = ABCMeta
369
370 - def __init__(self, config=None, minerName='None', callback=None):
371 self.config = config
372 self.name = minerName
373 self.callback = callback
374 self.dbFile = self.config.getas(self.name, 'database')
375
377 """Remove the database file (call os.system)."""
378 system("rm %s" % (self.dbFile))
379
380 @abstractmethod
382 raise NotImplementedError("Method must be implemented")
383
384
class TextMiner(Miner):
    """The miner for text files.

    This miner mines text files by extracting valid n-grams from them and
    inserting them in databases. Mining a text requires:
        - Tokenizing the text.
        - Extracting n-grams.
        - Inserting n-grams in a database in a special way.

    @see: L{TextTokenizer}, L{db.insert_ngrams}

    G{classtree TextMiner}
    """

    def __init__(self, config, minerName, callback=None):
        """Constructor of the TextMiner class.

        @param config:
            The configuration file. It is used to retrieve the miner
            parameters ('lowercase' and 'n' options of the miner section).
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            The callback is used to show the progress percentage. In the
            gui a callback method is implemented to update a progress bar
            showing the n-grams insertion progress. It may be None, in
            which case this class reports no progress.
        @type callback: fun(float, ...)
        """
        super().__init__(config, minerName, callback)
        self.lowercase = self.config.getas(self.name, 'lowercase')
        self.n = self.config.getas(self.name, 'n', 'int')

    def _notify(self, *args):
        """Forward a progress report to the callback, if there is one."""
        # The callback is optional (it defaults to None in the
        # constructor), so it must not be called unconditionally.
        if self.callback:
            self.callback(*args)

    def update_db(self, textPath):
        """Mine a text file, updating the database.

        N-grams of every size from 1 to self.n are extracted and added to
        the database in append mode.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        """
        for i in range(1, self.n + 1):
            self.add_to_db(self.crt_ngram_map(textPath, i), i, True)

    def crt_new_db(self, textPath):
        """Mine a text file.

        This method doesn't try to update the n-grams counts so it will
        fail if it tries to add an n-gram which is already in the database
        but this method is a little faster than update_db().

        @note: If you're intending to create a new database but it already
            exists please consider calling rm_db() first.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        """
        for i in range(1, self.n + 1):
            self.add_to_db(self.crt_ngram_map(textPath, i), i, False)

    def crt_ngram_map(self, textPath, n):
        """Create a n-gram dictionary from a file.

        The tokenizer built for the file is kept in self.tokenizer.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        @param n:
            The n in n-gram, handed to the tokenizer.
        @type n: int

        @return:
            The n-gram dictionary returned by the tokenizer.
        @rtype: dict
        """
        lg.info("Parsing " + str(n) + "-grams from " + textPath)
        self._notify(0, 'parsing ' + str(n) + '-grams from ' + textPath)
        # NOTE(review): the callback is passed on as-is, so it may be
        # None here - confirm TextTokenizer accepts that.
        self.tokenizer = TextTokenizer(
            textPath, n, self.lowercase, 0, self.callback)
        ngramMap = self.tokenizer.tknize_text()
        lg.info(
            str(len(ngramMap)) + ' ngrams have been extracted from ' +
            textPath)
        return ngramMap

    def add_to_db(self, ngramMap, n, append=False):
        """Add n-grams of an n-gram dictionary to the database.

        @param ngramMap:
            The n-gram dictionary returned by TextTokenizer.tknize_text().
        @type ngramMap: dict
        @param n:
            The n in n-gram, handed to insert_ngrams().
        @type n: int
        @param append:
            Indicate whether the n-grams should be appended to the
            database.
        @type append: bool
        """
        lg.info("Writing result to " + self.dbFile)
        self._notify(0, 'writing ' + str(n) + '-grams to ' + self.dbFile)
        # NOTE(review): the callback is passed on as-is, so it may be
        # None here - confirm insert_ngrams accepts that.
        insert_ngrams(ngramMap, n, self.dbFile, append, True, self.callback)
        lg.info('n-grams successfully added to the database')
488
491 """The miner for text corpus.
492
493 This miner is basically a L{minr.TextMiner} wrapper that implement the
494 mine() method which merely loops on every files of the corpus and call the
495 L{minr.TextMiner.update_db} method to effectively do the mining operation.
496
497 G{classtree CorpusMiner}
498 """
499
def __init__(self, config, minerName, callback=None):
    """Build a CorpusMiner and read its list of corpus files.

    @param config:
        The configuration file. The 'texts' option of the miner section
        is read from it as a list.
    @type config: L{drvr.Configuration}
    @param minerName:
        The name of the miner.
    @type minerName: str
    @param callback:
        The callback is used to show the progress percentage. In the gui
        a callback method is implemented to update a progress bar showing
        the n-grams insertion progress.
    @type callback: fun(float, ...)
    """
    super().__init__(config, minerName, callback)
    self.callback = callback
    corpus_files = self.config.getas(self.name, 'texts', 'list')
    self.corpusFiles = corpus_files
518
520 """Perform the mining operation."""
521 for text in self.corpusFiles:
522 self.update_db(text)
523 self.callback(100, 'Done')
524
527 """A miner to mine dictionary-like files.
528
529 This miner isn't a real miner as it only extract words from a
530 dictionary-like file and insert them into a database.
531 A dictionnary-like file is a file listing words, one word per line::
532 about
533 army
534 bath
535 boat
536 ...
537
538 G{classtree DictMiner}
539 """
540
def __init__(self, config, minerName, callback=None):
    """Build a DictMiner and read its file locations.

    @param config:
        The configuration file. The 'dictionary' and 'database' options
        of the miner section are read from it.
    @type config: L{drvr.Configuration}
    @param minerName:
        The name of the miner.
    @type minerName: str
    @param callback:
        The callback is used to show the progress percentage. In the gui
        a callback method is implemented to update a progress bar showing
        the n-grams insertion progress.
    @type callback: fun(float, ...)
    """
    super().__init__(config, minerName, callback)
    read_option = self.config.getas
    section = self.name
    self.dictFile = read_option(section, 'dictionary')
    self.database = read_option(section, 'database')
559
561 """Perform the mining operation.
562
563 @note: This method could have used the update_db() method like the C
564 orpusMiner and FbMiner do but this method avoid useless operations
565 and is, therefore, faster.
566
567 @todo 0.0.9:
568 Make sure every lines of the file contain one single word (or none).
569 """
570 progress = 0
571 sql = SqliteDatabaseConnector(self.database)
572 sql.crt_ngram_table()
573 with open(self.dictFile) as dictFile:
574 for i, l in enumerate(dictFile):
575 pass
576 noLines = i + 1
577 with open(self.dictFile) as dictFile:
578 for word in dictFile:
579 ngram = [word.strip('\n').lower()]
580 oldCount = sql.ngram_count(ngram)
581 if oldCount > 0:
582 sql.update_ngram(ngram, oldCount + 1)
583 else:
584 sql.insert_ngram(ngram, 1)
585 progress += 100 / noLines
586 if self.callback:
587 self.callback(progress)
588 sql.commit()
589 sql.crt_index(1)
590 sql.close_database()
591
614
617 """The Facebook user profile miner.
618
619 This miner use an access token to access a user facebook wall and retrieve
620 its text message. While the messages's text is retrieve, the miner write
621 them into a txt file. Once every messages have been write to the file the
622 miner generate n-grams from the file and insert them in the database, using
623 the L{minr.TextMiner} methods.
624
625 G{classtree FacebookMiner}
626
627 @note: The miner do not retrieve every facebook wall messages each time.
628 When mining a facebook wall he saves the published date of the latest
629 message and on next mining it will only retrieved the messages that have
630 been published AFTER the saved date.
631 See: L{minr.FacebookMiner.update_fb}
632
633 @todo 0.2.0:
634 Create web app in order to log the user to facebook and twitter,
635 authenticate them and ask for permissions (if needed) and finaly get
636 access token.
637 """
638
def __init__(self, config, minerName, callback=None):
    """Build a FacebookMiner and read its settings.

    The posts file name is fixed to 'fb.txt'; the access token and the
    time of the last update come from the miner section of the
    configuration.

    @param config:
        The configuration file. The 'accesstoken' and 'last_update'
        options of the miner section are read from it.
    @type config: L{drvr.Configuration}
    @param minerName:
        The name of the miner.
    @type minerName: str
    @param callback:
        The callback is used to show the progress percentage. In the gui
        a callback method is implemented to update a progress bar showing
        the n-grams insertion progress.
    @type callback: fun(float, ...)
    """
    super().__init__(config, minerName, callback)
    self.fbFile = 'fb.txt'
    self.callback = callback
    read_option = self.config.getas
    section = self.name
    self.accessToken = read_option(section, 'accesstoken')
    self.previousLast = read_option(section, 'last_update', 'int')
659
661 """Perform the mining operation."""
662 if not self.accessToken is None:
663 try:
664 jsonText = self.get_user_details()
665 jsonText = loads(jsonText)
666 userInfo = loads(jsonText)
667 except TypeError:
668 self.callback(0, 'error: Faebook access token is invalid')
669 lg.error('Facebook access token is invalid')
670 return
671 else:
672 self.callback(0, 'error: Faebook access token is missing')
673 lg.error('Facebook access token is missing')
674 return
675 self.callback(0, 'Mining facebook user posts')
676 PAGE_ID = userInfo['id']
677 graph = GraphAPI(self.accessToken)
678 profile = graph.get_object(PAGE_ID)
679 posts = graph.get_connections(profile['id'], 'posts')
680 self.update_fb(posts)
681 self.update_db(self.fbFile)
682 self.callback(100, 'Done')
683
685 """Write a facebook post message to a file.
686
687 @param post:
688 A facebook post is a dictionary. If the post contains a textual
689 message then it is associated to the 'message' key.
690 @type post: dict
691 @param fo:
692 The file to write in.
693 @type fo: TextIOWrapper
694 """
695 if "message" in post:
696 fo.write(post['message'].encode('utf-8') + "\n".encode('ascii'))
697
699 """Fully mine every posts of a facebook profile.
700
701 Mine the posts contained in the "posts" parameter (which should contains
702 the latest posts) and keep requesting older posts until we reach the
703 last post. If a post contains a textual message it is automatically
704 written in the file at self.fbFile.
705 The latest post unix time is computed and written in the config.
706
707 @note: It is not possible to get every posts of a facebook profile in a
708 single request (except if the profile contains very few posts).So
709 The method must scan the "posts" dictionary and request the older
710 posts until the request return a posts dictionary.
711
712 @param posts:
713 "posts" is returned by GraphAPI.get_connections(), it contains
714 posts of a facebook profile.
715 @type posts: dict
716 """
717 fo = open(self.fbFile, "wb")
718 lastPostTime = ''
719 while True:
720 try:
721 if lastPostTime == '':
722 lastPostTime = mktime(datetime.strptime(
723 posts['data'][0]['created_time'],
724 "%Y-%m-%dT%H:%M:%S+0000").timetuple())
725 for post in posts['data']:
726 self.write_to_file(post, fo)
727 posts = get(posts['paging']['next']).json()
728 except KeyError:
729 break
730 fo.close()
731 if lastPostTime:
732 self.config[self.name]['LAST_UPDATE'] = \
733 str(int(float(lastPostTime)))
734
736 """Mine posts of a facebook profile since the last mining operation.
737
738 Mine the posts contained in the "posts" parameter (which should contains
739 the latest posts) and keep requesting older posts until:
740 - We reach a post that has already be mined (comparison is carried
741 out using unix time and the 'last_update' config option)
742 - We reach the last post of the facebook profile.
743 If a post contains a textual message it is automatically written in
744 the file at self.fbFile.
745 The latest post unix time is computed and written in the config so that
746 we know which posts have been published after this one the next time
747 the method is called.
748
749 @note: It is not possible to get every posts of a facebook profile in a
750 single request (except if the profile contains very few posts).
751 So The method must scan the "posts" dictionary and request the
752 older posts until the request return a posts dictionary.
753
754 @param posts:
755 "posts" is returned by GraphAPI.get_connections(), it contains posts
756 of a facebook profile.
757 @type posts: dict
758 """
759 fo = open(self.fbFile, "wb")
760 lastPostTime = ''
761 stop = False
762 if self.previousLast:
763 previousLast = self.previousLast
764 else:
765 previousLast = FB_CREATION_TIME
766 while True:
767 try:
768 if lastPostTime == '':
769 lastPostTime = mktime(datetime.strptime(
770 posts['data'][0]['created_time'],
771 "%Y-%m-%dT%H:%M:%S+0000").timetuple())
772 diff = lastPostTime - previousLast
773 else:
774 try:
775
776 self.callback(float(100 * previousLast / mktime(
777 datetime.strptime(
778 posts['data'][0]['created_time'],
779 "%Y-%m-%dT%H:%M:%S+0000").timetuple())))
780 except IndexError:
781 pass
782 for post in posts['data']:
783 postTime = mktime(datetime.strptime(
784 post['created_time'],
785 "%Y-%m-%dT%H:%M:%S+0000").timetuple())
786 if postTime > previousLast:
787 self.write_to_file(post, fo)
788 else:
789 stop = True
790 break
791 if stop:
792 break
793 posts = get(posts['paging']['next']).json()
794 except KeyError:
795 break
796 fo.close()
797 if lastPostTime:
798 self.config[self.name]['last_update'] = \
799 str(int(float(lastPostTime)))
800
802 """Use the facebook access token to get details about the user.
803
804 @return:
805 The user details or an empty dictionary if the request fail wich
806 probably means that the access token is invalid or outdated.
807 @rtype: dict
808 """
809 jDict = {}
810 url = "https://graph.facebook.com/me?access_token=" + self.accessToken
811 try:
812 response = urlopen(Request(url))
813 jDict = dumps(response.read().decode('utf-8'))
814 except Exception:
815 pass
816 return jDict
817
819 """Override the parent method.
820
821 This method delete the database file and also set the last_update option
822 of the facebook miner to the oldest value possible so that the facebook
823 account will be fully scraped on next mining operation.
824 """
825 system("rm %s" % (self.dbFile))
826 self.config['FbMiner']['last_update'] = str(FB_CREATION_TIME)
827