Package tipy :: Module minr
[hide private]
[frames] | no frames]

Source Code for Module tipy.minr

  1  #!/usr/bin/env python3 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """The classes used for data mining. 
  5   
  6  Currently there are two data miners: 
  7      - Corpus miner: mines text files. 
  8      - Facebook miner: mines a facebook profile. In fact, this miner writes 
  9        the profile's posts into a file and then the file is mined the same way 
 10        as a text corpus. That's why the two miner classes subclass the 
 11        TextMiner class. 
 12   
 13  @todo 0.1.0: 
 14      Implement the twitter miner which should be very similar to the facebook 
 15      miner. 
 16  """ 
 17   
 18  from tipy.tknz import TextTokenizer 
 19  from tipy.db import SqliteDatabaseConnector, insert_ngrams 
 20  from os import system 
 21  from abc import ABCMeta, abstractmethod 
 22  try: 
 23      from urllib.request import Request, urlopen 
 24  except ImportError: 
 25      from urllib2 import Request, urlopen 
 26  from json import loads, dumps 
 27  from re import compile 
 28  from requests import request, HTTPError, get 
 29  from time import mktime 
 30  from datetime import datetime 
 31  from tipy.lg import lg 
 32   
 33   
 34  FB_CREATION_TIME = 1075852860 
 35  """@var: Facebook creation date in unix time.""" 
# The two classes below are adapted from the facebook python sdk package.
# The facebook python sdk can't be built for python 3, so the code was
# modified to make it compatible.
# See: https://facebook-sdk.readthedocs.org/
class GraphAPIError(Exception):
    """Error raised when a Graph API call fails or is misconfigured.

    The human readable message is extracted from ``result`` when it is a
    dictionary holding one of the keys ``error_description``,
    ``error["message"]`` or ``error_msg``; otherwise ``result`` itself is
    used as the message.

    @ivar result: The raw value the exception was built from.
    @ivar message: The extracted message.
    @ivar code: ``result["error"]["code"]`` when available, else None.
    @ivar type: ``result["error_code"]`` or ``result["error"]["type"]`` when
        available, else an empty string.
    """

    def __init__(self, result):
        self.result = result
        self.code = None
        self.type = ""
        message = result
        if isinstance(result, dict):
            self.type = result.get("error_code", "")
            error = result.get("error")
            if "error_description" in result:
                message = result["error_description"]
            elif isinstance(error, dict) and "message" in error:
                message = error["message"]
                self.code = error.get("code")
                if not self.type:
                    self.type = error.get("type", "")
            elif "error_msg" in result:
                message = result["error_msg"]
        self.message = message
        super().__init__(message)


class GraphAPI(object):
    """A client for the Facebook Graph API.

    See http://developers.facebook.com/docs/api for complete
    documentation for the API.

    The Graph API is made up of the objects in Facebook (e.g., people,
    pages, events, photos) and the connections between them (e.g.,
    friends, photo tags, and event RSVPs). This client provides access
    to those primitive types in a generic way. For example, given an
    OAuth access token, this will fetch the profile of the active user
    and the list of the user's friends:

        graph = GraphAPI(access_token)
        user = graph.get_object("me")
        friends = graph.get_connections(user["id"], "friends")

    You can see a list of all of the objects and connections supported
    by the API at http://developers.facebook.com/docs/reference/api/.

    You can obtain an access token via OAuth or by using the Facebook
    JavaScript SDK. See
    http://developers.facebook.com/docs/authentication/ for details.
    """

    def __init__(self, access_token=None, timeout=None, version=None):
        """Store the credentials and validate the requested API version.

        @param access_token: OAuth access token sent with every request
            (optional for public reads).
        @param timeout: Timeout forwarded to the HTTP requests.
        @param version: API version written as "#.#" (str or float). When
            falsy the default version "2.0" is used.
        @raise GraphAPIError: If the version is malformed or unsupported.
        """
        # The default version is only used if the version kwarg is not given.
        default_version = "2.0"
        valid_api_versions = ["2.0", "2.1", "2.2", "2.3"]

        self.access_token = access_token
        self.timeout = timeout

        if not version:
            self.version = "v" + default_version
            return

        version = str(version)
        # "compile" is re.compile, imported at the top of the module.
        if compile(r"^\d\.\d$").search(version) is None:
            raise GraphAPIError("Version number should be in the"
                                " following format: #.# (e.g. 2.0).")
        if version not in valid_api_versions:
            raise GraphAPIError("Valid API versions are " +
                                str(valid_api_versions).strip('[]'))
        self.version = "v" + version

    def get_object(self, id, **args):
        """Fetch the given object from the graph."""
        return self.request(self.version + "/" + id, args)

    def get_objects(self, ids, **args):
        """Fetch all of the given objects from the graph.

        The ids are joined with commas and sent as the "ids" query argument.
        """
        args["ids"] = ",".join(ids)
        return self.request(self.version + "/", args)

    def get_connections(self, id, connection_name, **args):
        """Fetch the connections for the given object."""
        return self.request(
            self.version + "/" + id + "/" + connection_name, args)

    def put_object(self, parent_object, connection_name, **data):
        """Write the given object to the graph, connected to the parent.

        For example,

            graph.put_object("me", "feed", message="Hello, world")

        writes "Hello, world" to the active user's wall. Likewise, this
        will comment on the first post of the active user's feed:

            feed = graph.get_connections("me", "feed")
            post = feed["data"][0]
            graph.put_object(post["id"], "comments", message="First!")

        See http://developers.facebook.com/docs/api#publishing for all
        of the supported writeable objects.

        Certain write operations require extended permissions. For
        example, publishing to a user's feed requires the
        "publish_actions" permission. See
        http://developers.facebook.com/docs/publishing/ for details
        about publishing permissions.
        """
        assert self.access_token, "Write operations require an access token"
        return self.request(
            self.version + "/" + parent_object + "/" + connection_name,
            post_args=data,
            method="POST")

    def put_wall_post(self, message, attachment=None, profile_id="me"):
        """Write a wall post to the given profile's wall.

        We default to writing to the authenticated user's wall if no
        profile_id is specified.

        attachment adds a structured attachment to the status message
        being posted to the Wall. It should be a dictionary of the form:

            {"name": "Link name"
             "link": "http://www.example.com/",
             "caption": "{*actor*} posted a new review",
             "description": "This is a longer description of the attachment",
             "picture": "http://www.example.com/thumbnail.jpg"}
        """
        # None instead of a shared mutable {} default.
        attachment = attachment or {}
        return self.put_object(profile_id, "feed", message=message,
                               **attachment)

    def put_comment(self, object_id, message):
        """Write the given comment on the given post."""
        return self.put_object(object_id, "comments", message=message)

    def put_like(self, object_id):
        """Like the given post."""
        return self.put_object(object_id, "likes")

    def delete_object(self, id):
        """Delete the object with the given ID from the graph."""
        self.request(self.version + "/" + id, method="DELETE")

    def delete_request(self, user_id, request_id):
        """Delete the Request with the given ID for the given user."""
        # NOTE(review): unlike the other calls, this path carries no version
        # prefix; kept as in the original code.
        self.request("%s_%s" % (request_id, user_id), method="DELETE")

    def put_photo(self, image, album_path="me/photos", **kwargs):
        """Upload an image using multipart/form-data.

        image - A file object representing the image to be uploaded.
        album_path - A path representing where the image should be uploaded.
        """
        return self.request(
            self.version + "/" + album_path,
            post_args=kwargs,
            files={"source": image},
            method="POST")

    def get_version(self):
        """Fetch the version number of the Graph API being used.

        @return: The value of the "facebook-api-version" response header,
            without its leading "v", as a float.
        @raise GraphAPIError: If the request fails or the header is missing
            or not a number.
        """
        args = {"access_token": self.access_token}
        try:
            response = request("GET",
                               "https://graph.facebook.com/" +
                               self.version + "/me",
                               params=args,
                               timeout=self.timeout)
        except HTTPError as e:
            raise GraphAPIError(self._http_error_payload(e))

        try:
            version = response.headers["facebook-api-version"]
            return float(version.replace("v", ""))
        except Exception:
            raise GraphAPIError("API version number not available")

    def request(
            self, path, args=None, post_args=None, files=None, method=None):
        """Fetch the given path in the Graph API.

        We translate args to a valid query string. If post_args is
        given, we send a POST request to the given path with the given
        arguments.

        @return: The decoded JSON body, or a dict describing an image
            ("data", "mime-type", "url"), or a dict holding the
            "access_token" (and "expires") parsed from a query-string body.
        @raise GraphAPIError: If the HTTP call fails, the body cannot be
            interpreted, or the decoded result carries an "error" entry.
        """
        # Imported locally: the module header does not import parse_qs.
        from urllib.parse import parse_qs

        args = args or {}

        if post_args is not None:
            method = "POST"

        if self.access_token:
            if post_args is not None:
                post_args["access_token"] = self.access_token
            else:
                args["access_token"] = self.access_token

        try:
            response = request(method or "GET",
                               "https://graph.facebook.com/" + path,
                               timeout=self.timeout,
                               params=args,
                               data=post_args,
                               files=files)
        except HTTPError as e:
            raise GraphAPIError(self._http_error_payload(e))

        # A missing header is treated as "neither json nor image".
        content_type = response.headers.get('content-type', '')
        if 'json' in content_type:
            result = response.json()
        elif 'image/' in content_type:
            result = {"data": response.content,
                      "mime-type": content_type,
                      "url": response.url}
        else:
            query_str = parse_qs(response.text)
            if "access_token" not in query_str:
                raise GraphAPIError(
                    'Maintype was not text, image, or querystring')
            result = {"access_token": query_str["access_token"][0]}
            if "expires" in query_str:
                result["expires"] = query_str["expires"][0]

        if result and isinstance(result, dict) and result.get("error"):
            raise GraphAPIError(result)
        return result

    @staticmethod
    def _http_error_payload(error):
        """Return the best available description of an HTTP error.

        Uses the JSON body of ``error.response`` when there is one and it
        decodes; otherwise falls back to ``str(error)``.
        """
        response = getattr(error, "response", None)
        if response is not None:
            try:
                return response.json()
            except ValueError:
                pass
        return str(error)

    def fql(self, query):
        """Run an FQL query.

        Example query: "SELECT affiliations FROM user WHERE uid = me()"
        """
        return self.request(self.version + "/" + "fql", {"q": query})

    def get_app_access_token(self, app_id, app_secret):
        """Get the application's access token as a string."""
        args = {'grant_type': 'client_credentials',
                'client_id': app_id,
                'client_secret': app_secret}

        return self.request("oauth/access_token", args=args)["access_token"]

    def get_access_token_from_code(
            self, code, redirect_uri, app_id, app_secret):
        """Get an access token from the "code" returned from an OAuth dialog.

        Returns a dict containing the user-specific access token and its
        expiration date (if applicable).
        """
        args = {
            "code": code,
            "redirect_uri": redirect_uri,
            "client_id": app_id,
            "client_secret": app_secret}

        return self.request("oauth/access_token", args)

    def extend_access_token(self, app_id, app_secret):
        """Extend the expiration time of a valid OAuth access token.

        See <https://developers.facebook.com/roadmap/offline-access-removal/
        #extend_token>
        """
        args = {
            "client_id": app_id,
            "client_secret": app_secret,
            "grant_type": "fb_exchange_token",
            "fb_exchange_token": self.access_token,
        }

        return self.request("oauth/access_token", args=args)
286
class MinerRegistry(list):
    """List every "miner" instance to be used.

    MinerRegistry gathers the miners to be used according to the
    configuration. It provides methods to run every registered miner and to
    close the databases of the miners exposing a close_database() method.

    G{classtree MinerRegistry}
    """

    def __init__(self, config):
        """Constructor of the MinerRegistry class.

        @param config:
            The configuration. It is used to retrieve the miners classes
            names that will be added to the MinerRegistry.
        @type config: L{drvr.Configuration}
        """
        super().__init__()
        self.config = config
        self.set_miners()

    def set_miners(self):
        """Rebuild the list of miners according to the configuration.

        The list is emptied, then every name found in the 'miners' option of
        the 'MinerRegistry' section is passed to add_miner().
        """
        self[:] = []
        for miner in self.config.getas('MinerRegistry', 'miners', 'list'):
            self.add_miner(miner)

    def add_miner(self, minerName):
        """Try to add a miner instance to the list using its name.

        The 'class' option of the section named minerName selects the miner
        class to instantiate. If the section or the option is missing, or if
        the class is unknown, a message is printed and nothing is added.

        @param minerName:
            The name of the miner. It must correspond to a section of the
            configuration so that its miner class can be retrieved.
        @type minerName: str
        """
        try:
            minerClass = self.config[minerName]['class']
        except KeyError:
            print("Config file is incorrect. Check the 'class' key of the"
                  " %s section if exists or create it." % (minerName))
            # Without a class name there is nothing to instantiate.
            return
        if minerClass == 'CorpusMiner':
            miner = CorpusMiner(self.config, minerName)
        elif minerClass == 'FacebookMiner':
            miner = FacebookMiner(self.config, minerName)
        elif minerClass == 'DictMiner':
            miner = DictMiner(self.config, minerName)
        else:
            print('WARNING: miner class "%s" of miner "%s" is unknown. The'
                  ' miner won\'t be added to the registry.'
                  % (minerClass, minerName))
            return
        self.append(miner)

    def mine(self):
        """Use the miner instances list to mine the sources.

        Loop through every miner instance of the list and call its mine()
        method to perform its mining operation.
        """
        for miner in self:
            miner.mine()

    def close_databases(self):
        """Close the database of every miner exposing close_database()."""
        for miner in self:
            if callable(getattr(miner, "close_database", None)):
                miner.close_database()
360
class Miner(object):
    """Base class for all miners.

    G{classtree Miner}

    @note: The __metaclass__ attribute below is the Python 2 spelling and is
        kept as it was. Under Python 3 it does not make the class abstract,
        so mine() only fails when it is actually called on a class that does
        not override it.
    """

    __metaclass__ = ABCMeta

    def __init__(self, config=None, minerName='None', callback=None):
        """Store the miner settings and look up its database path.

        @param config:
            The configuration. When given, its 'database' option for the
            section minerName becomes self.dbFile; when None, self.dbFile is
            None.
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner (the configuration section to read).
        @type minerName: str
        @param callback:
            Optional progress callback.
        @type callback: fun(float, ...)
        """
        self.config = config
        self.name = minerName
        self.callback = callback
        if config is not None:
            self.dbFile = config.getas(self.name, 'database')
        else:
            self.dbFile = None

    def rm_db(self):
        """Remove the database file if it exists.

        The file is removed with os.remove() instead of a shell command so
        that paths containing spaces or shell metacharacters are handled.
        A missing file is ignored; any other OS error is logged.
        """
        import os

        if not self.dbFile:
            return
        try:
            os.remove(self.dbFile)
        except FileNotFoundError:
            pass
        except OSError as err:
            lg.error("Cannot remove %s: %s" % (self.dbFile, err))

    @abstractmethod
    def mine(self):
        """Perform the mining operation (to be overridden)."""
        raise NotImplementedError("Method must be implemented")
383
class TextMiner(Miner):
    """The miner for text files.

    This miner mines text files by extracting n-grams from them and
    inserting them in a database. Mining a text requires:
        - Tokenizing the text.
        - Extracting n-grams.
        - Inserting n-grams in a database.

    @see: L{TextTokenizer}, L{db.insert_ngrams}

    G{classtree TextMiner}
    """

    def __init__(self, config, minerName, callback=None):
        """Constructor of the TextMiner class.

        @param config:
            The configuration. It is used to retrieve the miner parameters
            ('lowercase' and 'n' options of the minerName section).
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            Optional callback used to show the progress percentage.
        @type callback: fun(float, ...)
        """
        super().__init__(config, minerName, callback)
        # NOTE(review): 'lowercase' is read without a type argument, unlike
        # 'n' -- confirm what getas() returns in that case.
        self.lowercase = self.config.getas(self.name, 'lowercase')
        self.n = self.config.getas(self.name, 'n', 'int')

    def _notify(self, *args):
        """Forward args to the progress callback when there is one."""
        if self.callback:
            self.callback(*args)

    def _mine_file(self, textPath, append):
        """Extract and store the 1-grams up to self.n-grams of a file."""
        for size in range(1, self.n + 1):
            self.add_to_db(self.crt_ngram_map(textPath, size), size, append)

    def update_db(self, textPath):
        """Mine a text file, updating the database.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        """
        self._mine_file(textPath, True)

    def crt_new_db(self, textPath):
        """Mine a text file without the "append" mode of add_to_db().

        According to the original notes this is a little faster than
        update_db() but fails if an n-gram is already in the database.

        @note: If you're intending to create a new database but it already
            exists please consider calling rm_db() first.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        """
        self._mine_file(textPath, False)

    def crt_ngram_map(self, textPath, n):
        """Create a n-gram dictionary from a file.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        @param n:
            The n in n-gram, forwarded to the TextTokenizer.
        @type n: int

        @return:
            The value returned by TextTokenizer.tknize_text().
        @rtype: dict
        """
        lg.info("Parsing " + str(n) + "-grams from " + textPath)
        self._notify(0, 'parsing ' + str(n) + '-grams from ' + textPath)
        # NOTE(review): the callback may be None here; confirm that
        # TextTokenizer accepts it.
        self.tokenizer = TextTokenizer(
            textPath, n, self.lowercase, 0, self.callback)
        ngramMap = self.tokenizer.tknize_text()
        lg.info(
            str(len(ngramMap)) + ' ngrams have been extracted from ' +
            textPath)
        return ngramMap

    def add_to_db(self, ngramMap, n, append=False):
        """Add n-grams of an n-gram dictionary to the database.

        @param ngramMap:
            The n-gram dictionary returned by TextTokenizer.tknize_text().
        @type ngramMap: dict
        @param n:
            The n in n-gram, forwarded to insert_ngrams().
        @type n: int
        @param append:
            Indicate whether the n-grams should be appended to the database.
        @type append: bool
        """
        lg.info("Writing result to " + self.dbFile)
        self._notify(0, 'writing ' + str(n) + '-grams to ' + self.dbFile)
        # NOTE(review): the callback may be None here; confirm that
        # insert_ngrams accepts it.
        insert_ngrams(ngramMap, n, self.dbFile, append, True, self.callback)
        lg.info('n-grams successfully added to the database')
488
class CorpusMiner(TextMiner):
    """The miner for text corpus.

    This miner is basically a L{minr.TextMiner} wrapper that implements the
    mine() method, which merely loops on every file of the corpus and calls
    the L{minr.TextMiner.update_db} method to do the mining operation.

    G{classtree CorpusMiner}
    """

    def __init__(self, config, minerName, callback=None):
        """Constructor of the CorpusMiner class.

        @param config:
            The configuration. It is used to retrieve the miner parameters
            (the 'texts' option lists the corpus files).
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            Optional callback used to show the progress percentage.
        @type callback: fun(float, ...)
        """
        # The parent constructors already store the callback.
        super().__init__(config, minerName, callback)
        self.corpusFiles = self.config.getas(self.name, 'texts', 'list')

    def mine(self):
        """Mine every corpus file, then report completion."""
        for text in self.corpusFiles:
            self.update_db(text)
        # The callback is optional (it defaults to None).
        if self.callback:
            self.callback(100, 'Done')
524
class DictMiner(Miner):
    """A miner to mine dictionary-like files.

    This miner isn't a real miner as it only extracts words from a
    dictionary-like file and inserts them into a database.
    A dictionary-like file is a file listing words, one word per line::
        about
        army
        bath
        boat
        ...

    G{classtree DictMiner}
    """

    def __init__(self, config, minerName, callback=None):
        """Constructor of the DictMiner class.

        @param config:
            The configuration. It is used to retrieve the miner parameters
            ('dictionary' and 'database' options of the minerName section).
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            Optional callback used to show the progress percentage.
        @type callback: fun(float, ...)
        """
        super().__init__(config, minerName, callback)
        self.dictFile = self.config.getas(self.name, 'dictionary')
        self.database = self.config.getas(self.name, 'database')

    def mine(self):
        """Perform the mining operation.

        This simply runs insert_words(), which writes the words directly
        through a SqliteDatabaseConnector.

        @todo 0.0.9:
            Make sure every line of the file contains one single word (or
            none).
        """
        self.insert_words()

    def insert_words(self):
        """Insert every line of the dictionary file as a 1-gram.

        Each line is stripped of its newline and lowercased. A word already
        present in the database has its count incremented, otherwise it is
        inserted with a count of 1. The callback, when set, receives the
        progress percentage after each line. The database connection is
        closed even if an error occurs. An empty file inserts nothing.
        """
        # First pass: count the lines so that progress can be reported.
        with open(self.dictFile) as dictFile:
            noLines = sum(1 for _ in dictFile)

        progress = 0
        sql = SqliteDatabaseConnector(self.database)
        try:
            sql.crt_ngram_table()
            with open(self.dictFile) as dictFile:
                for word in dictFile:
                    ngram = [word.strip('\n').lower()]
                    oldCount = sql.ngram_count(ngram)
                    if oldCount > 0:
                        sql.update_ngram(ngram, oldCount + 1)
                    else:
                        sql.insert_ngram(ngram, 1)
                    progress += 100 / noLines
                    if self.callback:
                        self.callback(progress)
            sql.commit()
            sql.crt_index(1)
        finally:
            sql.close_database()
614
class FacebookMiner(TextMiner):
    """The Facebook user profile miner.

    This miner uses an access token to access a user facebook wall and
    retrieve its text messages. While the messages are retrieved, the miner
    writes them into a text file. Once every message has been written to the
    file the miner generates n-grams from the file and inserts them in the
    database, using the L{minr.TextMiner} methods.

    G{classtree FacebookMiner}

    @note: The miner does not retrieve every facebook wall message each time.
        When mining a facebook wall it saves the published date of the latest
        message and on next mining it only retrieves the messages that have
        been published AFTER the saved date.
        See: L{minr.FacebookMiner.update_fb}

    @todo 0.2.0:
        Create web app in order to log the user to facebook and twitter,
        authenticate them and ask for permissions (if needed) and finally get
        access token.
    """

    # Format of the 'created_time' field of a post.
    _TIME_FORMAT = "%Y-%m-%dT%H:%M:%S+0000"

    def __init__(self, config, minerName, callback=None):
        """Constructor of the FacebookMiner class.

        @param config:
            The configuration. It is used to retrieve the miner parameters
            ('accesstoken' and 'last_update' options).
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            Optional callback used to show the progress percentage.
        @type callback: fun(float, ...)
        """
        # The parent constructors already store the callback.
        super().__init__(config, minerName, callback)
        self.fbFile = 'fb.txt'
        self.accessToken = self.config.getas(self.name, 'accesstoken')
        self.previousLast = self.config.getas(self.name, 'last_update', 'int')

    def _notify(self, *args):
        """Forward args to the progress callback when there is one."""
        if self.callback:
            self.callback(*args)

    @classmethod
    def _post_time(cls, post):
        """Return the 'created_time' of a post as a unix time (float).

        NOTE(review): the "+0000" timestamp is converted with mktime(), i.e.
        interpreted as local time. All comparisons in this class use the same
        conversion, so it is kept unchanged.
        """
        created = datetime.strptime(post['created_time'], cls._TIME_FORMAT)
        return mktime(created.timetuple())

    def mine(self):
        """Perform the mining operation.

        The access token is checked first by requesting the user details.
        Then the posts of the user are fetched and written to self.fbFile by
        update_fb(), and the file is mined with update_db(). An error is
        reported and logged if the token is missing or invalid.
        """
        if self.accessToken is None:
            self._notify(0, 'error: Facebook access token is missing')
            lg.error('Facebook access token is missing')
            return
        try:
            # get_user_details() returns the response text re-encoded as a
            # JSON string, hence the two decoding steps. On failure it
            # returns a dict, which makes loads() raise TypeError.
            userInfo = loads(loads(self.get_user_details()))
            userId = userInfo['id']
        except (TypeError, ValueError, KeyError):
            self._notify(0, 'error: Facebook access token is invalid')
            lg.error('Facebook access token is invalid')
            return
        self._notify(0, 'Mining facebook user posts')
        graph = GraphAPI(self.accessToken)
        profile = graph.get_object(userId)
        posts = graph.get_connections(profile['id'], 'posts')
        self.update_fb(posts)
        self.update_db(self.fbFile)
        self._notify(100, 'Done')

    def write_to_file(self, post, fo):
        """Write a facebook post message to a file.

        @param post:
            A facebook post is a dictionary. If the post contains a textual
            message then it is associated to the 'message' key.
        @type post: dict
        @param fo:
            The file to write in; it must accept bytes.
        @type fo: BufferedWriter
        """
        if "message" in post:
            fo.write(post['message'].encode('utf-8') + b"\n")

    def scrap_fb(self, posts):
        """Fully mine every post of a facebook profile.

        Mine the posts contained in the "posts" parameter (which should
        contain the latest posts) and keep requesting older posts until there
        is no next page. If a post contains a textual message it is written
        in the file at self.fbFile.
        The unix time of the first post of the first page is written in the
        'last_update' option of the config.

        @param posts:
            "posts" is returned by GraphAPI.get_connections(), it contains
            posts of a facebook profile.
        @type posts: dict
        """
        lastPostTime = ''
        with open(self.fbFile, "wb") as fo:
            while True:
                try:
                    if lastPostTime == '':
                        lastPostTime = self._post_time(posts['data'][0])
                    for post in posts['data']:
                        self.write_to_file(post, fo)
                    posts = get(posts['paging']['next']).json()
                except (KeyError, IndexError):
                    # No data / no next page: everything has been read.
                    break
        if lastPostTime:
            self.config[self.name]['last_update'] = \
                str(int(float(lastPostTime)))

    def update_fb(self, posts):
        """Mine posts of a facebook profile since the last mining operation.

        Mine the posts contained in the "posts" parameter (which should
        contain the latest posts) and keep requesting older posts until:
            - We reach a post that has already been mined (comparison is
              carried out using unix time and the 'last_update' config
              option, or FB_CREATION_TIME when that option is unset).
            - There is no next page.
        If a post contains a textual message it is written in the file at
        self.fbFile.
        The unix time of the first post of the first page is written in the
        'last_update' option of the config so that we know which posts have
        been published after this one the next time the method is called.

        @param posts:
            "posts" is returned by GraphAPI.get_connections(), it contains
            posts of a facebook profile.
        @type posts: dict
        """
        previousLast = self.previousLast or FB_CREATION_TIME
        lastPostTime = ''
        with open(self.fbFile, "wb") as fo:
            while True:
                try:
                    data = posts['data']
                    if lastPostTime == '':
                        lastPostTime = self._post_time(data[0])
                    elif data:
                        # not very regular, particularly on first mining
                        self._notify(float(
                            100 * previousLast / self._post_time(data[0])))
                    alreadyMined = False
                    for post in data:
                        if self._post_time(post) > previousLast:
                            self.write_to_file(post, fo)
                        else:
                            alreadyMined = True
                            break
                    if alreadyMined:
                        break
                    posts = get(posts['paging']['next']).json()
                except (KeyError, IndexError):
                    # No data / no next page: everything has been read.
                    break
        if lastPostTime:
            self.config[self.name]['last_update'] = \
                str(int(float(lastPostTime)))

    def get_user_details(self):
        """Use the facebook access token to get details about the user.

        @return:
            On success, the response text encoded as a JSON string (so it
            must be decoded twice to obtain the user dictionary). An empty
            dictionary if the request fails, which probably means that the
            access token is invalid or outdated.
        @rtype: str or dict
        """
        url = "https://graph.facebook.com/me?access_token=" + self.accessToken
        try:
            with urlopen(Request(url)) as response:
                return dumps(response.read().decode('utf-8'))
        except Exception as err:
            # Best effort: the caller treats the empty dict as a failure.
            lg.error('Cannot retrieve the facebook user details: %s' % err)
            return {}

    def rm_db(self):
        """Override the parent method.

        This method deletes the database file and also sets the last_update
        option of this miner to the oldest value possible so that the
        facebook account will be fully scraped on next mining operation.
        """
        super().rm_db()
        self.config[self.name]['last_update'] = str(FB_CREATION_TIME)
827