Package prest :: Module minr
[hide private]
[frames] | no frames]

Source Code for Module prest.minr

  1  #!/usr/bin/env python3 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """The classes used for data mining. 
  5   
  6  Currently there are two data miners: 
  7      - Corpus miner: mine from text files. 
  8      - Facebook miner: mine from a facebook profile. In fact, this miner writes 
  9        the profile's posts into a file and then the file is mined the same way as 
 10        a text corpus. That's why the two miner classes subclass the TextMiner 
 11        class. 
 12   
 13  @todo 0.1.0: 
 14      Implement the twitter miner which should be very similar to the facebook 
 15      miner. 
 16  """ 
 17   
 18  try: 
 19      import configparser 
 20  except ImportError: 
 21      import ConfigParser as configparser 
 22  import tknz 
 23  import db 
 24  import os 
 25  import abc 
 26  import urllib.request as urllib2 
 27  import json 
 28  import re 
 29  import requests 
 30  import time 
 31  import datetime 
 32  from lg import lg 
 33   
 34   
 35  #: Facebook creation date in unix time 
 36  FB_CREATION_TIME = 1075852860 
# This class is taken from the facebook python sdk package.
# The facebook python sdk can't be built for python 3 so it was modified to
# make it compatible.
# see: https://facebook-sdk.readthedocs.org/en/latest/
class GraphAPIError(Exception):
    """Error raised by L{GraphAPI} when a call fails or is misused.

    GraphAPI raises this exception on every error path but the class was
    not part of this module, so each of those paths ended in a NameError.

    @param result:
        Either a plain message or the decoded error payload of a response.
        For a dictionary payload the message is looked up, in order, under
        'error_description', result['error']['message'] and 'error_msg'.
    @type result: str or dict
    """

    def __init__(self, result):
        self.result = result
        self.code = None
        self.type = ""
        message = result
        if isinstance(result, dict):
            self.type = result.get("error_code", "")
            error = result.get("error")
            if "error_description" in result:
                message = result["error_description"]
            elif isinstance(error, dict) and "message" in error:
                message = error["message"]
                self.code = error.get("code")
                if not self.type:
                    self.type = error.get("type", "")
            elif "error_msg" in result:
                message = result["error_msg"]
        self.message = message
        super().__init__(message)


class GraphAPI(object):
    """A client for the Facebook Graph API.

    See http://developers.facebook.com/docs/api for complete
    documentation for the API.

    The Graph API is made up of the objects in Facebook (e.g., people,
    pages, events, photos) and the connections between them (e.g.,
    friends, photo tags, and event RSVPs). This client provides access
    to those primitive types in a generic way. For example, given an
    OAuth access token, this will fetch the profile of the active user
    and the list of the user's friends:

        graph = facebook.GraphAPI(access_token)
        user = graph.get_object("me")
        friends = graph.get_connections(user["id"], "friends")

    You can see a list of all of the objects and connections supported
    by the API at http://developers.facebook.com/docs/reference/api/.

    You can obtain an access token via OAuth or by using the Facebook
    JavaScript SDK. See
    http://developers.facebook.com/docs/authentication/ for details.
    """

    def __init__(self, access_token=None, timeout=None, version=None):
        """Store the credentials and validate the requested API version.

        @param access_token:
            The OAuth access token sent with every request (optional).
        @param timeout:
            Timeout forwarded as-is to requests.request().
        @param version:
            The API version, formatted "#.#" (e.g. "2.0"). Falls back to
            "2.0" when omitted.

        @raise GraphAPIError:
            If the version is malformed or is not a supported version.
        """
        # The default version is only used if the version kwarg is not given.
        default_version = "2.0"
        valid_API_versions = ["2.0", "2.1", "2.2", "2.3"]

        self.access_token = access_token
        self.timeout = timeout

        if not version:
            self.version = "v" + default_version
            return

        version = str(version)
        if re.search(r"^\d\.\d$", version) is None:
            raise GraphAPIError("Version number should be in the"
                                " following format: #.# (e.g. 2.0).")
        if version not in valid_API_versions:
            raise GraphAPIError("Valid API versions are " +
                                str(valid_API_versions).strip('[]'))
        self.version = "v" + version

    @staticmethod
    def _http_error(error):
        """Build a GraphAPIError from a requests.HTTPError.

        The error payload is read from the response attached to the
        exception; when there is none, or it is not JSON, the text of the
        exception itself is used.
        """
        try:
            payload = error.response.json()
        except (AttributeError, ValueError):
            payload = str(error)
        return GraphAPIError(payload)

    def get_object(self, id, **args):
        """Fetches the given object from the graph."""
        return self.request(self.version + "/" + id, args)

    def get_objects(self, ids, **args):
        """Fetches all of the given objects from the graph.

        We return a map from ID to object. If any of the IDs are
        invalid, we raise an exception.
        """
        args["ids"] = ",".join(ids)
        return self.request(self.version + "/", args)

    def get_connections(self, id, connection_name, **args):
        """Fetches the connections for the given object."""
        return self.request(
            self.version + "/" + id + "/" + connection_name, args)

    def put_object(self, parent_object, connection_name, **data):
        """Writes the given object to the graph, connected to the parent.

        For example,

            graph.put_object("me", "feed", message="Hello, world")

        writes "Hello, world" to the active user's wall. Likewise, this
        will comment on the first post of the active user's feed:

            feed = graph.get_connections("me", "feed")
            post = feed["data"][0]
            graph.put_object(post["id"], "comments", message="First!")

        See http://developers.facebook.com/docs/api#publishing for all
        of the supported writeable objects.

        Certain write operations require extended permissions. For
        example, publishing to a user's feed requires the
        "publish_actions" permission. See
        http://developers.facebook.com/docs/publishing/ for details
        about publishing permissions.
        """
        assert self.access_token, "Write operations require an access token"
        return self.request(
            self.version + "/" + parent_object + "/" + connection_name,
            post_args=data,
            method="POST")

    def put_wall_post(self, message, attachment=None, profile_id="me"):
        """Writes a wall post to the given profile's wall.

        We default to writing to the authenticated user's wall if no
        profile_id is specified.

        attachment adds a structured attachment to the status message
        being posted to the Wall. It should be a dictionary of the form:

            {"name": "Link name"
             "link": "http://www.example.com/",
             "caption": "{*actor*} posted a new review",
             "description": "This is a longer description of the attachment",
             "picture": "http://www.example.com/thumbnail.jpg"}
        """
        # None replaces the former shared mutable default ({}).
        if attachment is None:
            attachment = {}
        return self.put_object(profile_id, "feed", message=message,
                               **attachment)

    def put_comment(self, object_id, message):
        """Writes the given comment on the given post."""
        return self.put_object(object_id, "comments", message=message)

    def put_like(self, object_id):
        """Likes the given post."""
        return self.put_object(object_id, "likes")

    def delete_object(self, id):
        """Deletes the object with the given ID from the graph."""
        self.request(self.version + "/" + id, method="DELETE")

    def delete_request(self, user_id, request_id):
        """Deletes the Request with the given ID for the given user."""
        self.request("%s_%s" % (request_id, user_id), method="DELETE")

    def put_photo(self, image, album_path="me/photos", **kwargs):
        """Upload an image using multipart/form-data.

        image - A file object representing the image to be uploaded.
        album_path - A path representing where the image should be uploaded.
        """
        return self.request(
            self.version + "/" + album_path,
            post_args=kwargs,
            files={"source": image},
            method="POST")

    def get_version(self):
        """Fetches the current version number of the Graph API being used.

        @return: The value of the "facebook-api-version" response header.
        @rtype: float

        @raise GraphAPIError:
            If the request fails or the header is missing or malformed.
        """
        args = {"access_token": self.access_token}
        try:
            response = requests.request(
                "GET",
                "https://graph.facebook.com/" + self.version + "/me",
                params=args,
                timeout=self.timeout)
        except requests.HTTPError as e:
            raise self._http_error(e) from e

        try:
            version = response.headers["facebook-api-version"]
            return float(version.replace("v", ""))
        except Exception as e:
            raise GraphAPIError("API version number not available") from e

    def request(
            self, path, args=None, post_args=None, files=None, method=None):
        """Fetches the given path in the Graph API.

        We translate args to a valid query string. If post_args is
        given, we send a POST request to the given path with the given
        arguments.

        @return:
            The decoded JSON body, a dictionary with the keys 'data',
            'mime-type' and 'url' for an image, or a dictionary with
            'access_token' (and 'expires' when present) for a query string
            body.

        @raise GraphAPIError:
            If the body is of none of these kinds or describes an error.
        """
        # Local import: parse_qs was used here without ever being imported.
        from urllib.parse import parse_qs

        args = args or {}

        if post_args is not None:
            method = "POST"

        if self.access_token:
            if post_args is not None:
                post_args["access_token"] = self.access_token
            else:
                args["access_token"] = self.access_token

        try:
            response = requests.request(
                method or "GET",
                "https://graph.facebook.com/" + path,
                timeout=self.timeout,
                params=args,
                data=post_args,
                files=files)
        except requests.HTTPError as e:
            raise self._http_error(e) from e

        # A response without a content type is handled like any other
        # unrecognised one instead of failing with a KeyError.
        content_type = response.headers.get("content-type", "")
        if "json" in content_type:
            result = response.json()
        elif "image/" in content_type:
            result = {"data": response.content,
                      "mime-type": content_type,
                      "url": response.url}
        else:
            query_str = parse_qs(response.text)
            if "access_token" not in query_str:
                raise GraphAPIError(
                    'Maintype was not text, image, or querystring')
            result = {"access_token": query_str["access_token"][0]}
            if "expires" in query_str:
                result["expires"] = query_str["expires"][0]

        if result and isinstance(result, dict) and result.get("error"):
            raise GraphAPIError(result)
        return result

    def fql(self, query):
        """FQL query.

        Example query: "SELECT affiliations FROM user WHERE uid = me()"
        """
        return self.request(self.version + "/" + "fql", {"q": query})

    def get_app_access_token(self, app_id, app_secret):
        """Get the application's access token as a string."""
        args = {'grant_type': 'client_credentials',
                'client_id': app_id,
                'client_secret': app_secret}

        return self.request("oauth/access_token", args=args)["access_token"]

    def get_access_token_from_code(
            self, code, redirect_uri, app_id, app_secret):
        """Get an access token from the "code" returned from an OAuth dialog.

        Returns a dict containing the user-specific access token and its
        expiration date (if applicable).
        """
        args = {
            "code": code,
            "redirect_uri": redirect_uri,
            "client_id": app_id,
            "client_secret": app_secret}

        return self.request("oauth/access_token", args)

    def extend_access_token(self, app_id, app_secret):
        """Extends the expiration time of a valid OAuth access token.

        See <https://developers.facebook.com/roadmap/offline-access-removal/
        #extend_token>
        """
        args = {
            "client_id": app_id,
            "client_secret": app_secret,
            "grant_type": "fb_exchange_token",
            "fb_exchange_token": self.access_token,
        }

        return self.request("oauth/access_token", args=args)
287
class MinerRegistry(list):
    """List every "miner" classes to be used.

    MinerRegistry gathers every miner to be used according to the
    configuration file. It provides methods to mine corpuses from different
    sources, using different processing strategies and store the results in
    different outputs.

    G{classtree MinerRegistry}
    """

    def __init__(self, config):
        """Constructor of the MinerRegistry class.

        @param config:
            The configuration file. It is used to retrieve the miners classes
            names that will be added to the MinerRegistry.
        @type config: L{drvr.Configuration}
        """
        super().__init__()
        self.config = config
        self.set_miners()

    def set_miners(self):
        """Add miner class(es) to the list according to the configuration.

        Empty the list, retrieve the miners names from the 'miners' option
        of the 'MinerRegistry' section and try to add each of them.
        """
        self[:] = []
        for miner in self.config.getas('MinerRegistry', 'miners', 'list'):
            self.add_miner(miner)

    def add_miner(self, minerName):
        """Try to add a miner class to the list using its name.

        Get the miner class name from the configuration and create an instance
        of this class if it exists, then, add the instance to the list. When
        the section or its 'class' key is missing, or the class is unknown, a
        message is printed and nothing is added.

        @param minerName:
            The name of the miner. It must correspond to a section of the
            configuration so that its miner class can be retrieved.
        @type minerName: str
        """
        try:
            minerClass = self.config[minerName]['class']
        except KeyError:
            print("Config file is incorrect. Check the miner key of the"
                  " %s section if exists or create it." % (minerName))
            # Without a class name there is nothing to instantiate; going on
            # used to fail on the unbound minerClass variable.
            return
        if minerClass == 'CorpusMiner':
            miner = CorpusMiner(self.config, minerName)
        elif minerClass == 'FacebookMiner':
            # Used to build a CorpusMiner, so facebook was never mined.
            miner = FacebookMiner(self.config, minerName)
        else:
            print('WARNING: miner class "%s" is unknown. miner won\'t be '
                  'added to the registry.' % minerClass)
            return
        self.append(miner)

    def mine(self):
        """Use the miner instances list to mine the sources.

        Loop through every miner instances of the list and call their mine()
        method to perform their mining operation.
        """
        for miner in self:
            miner.mine()

    def close_databases(self):
        """Close database of every miner instances using a database.

        Miners without a callable close_database attribute are skipped.
        """
        for miner in self:
            if callable(getattr(miner, "close_database", None)):
                miner.close_database()
361
class Miner(object):
    """Abstract class for all miners.

    G{classtree Miner}
    """

    # NOTE: this is the python 2 spelling and has no effect on python 3, so
    # "abstract" is not enforced. It is left untouched on purpose: TextMiner
    # does not define mine() and would stop being instantiable otherwise.
    __metaclass__ = abc.ABCMeta

    def __init__(self, config=None, minerName='None', callback=None):
        """Constructor of the Miner class.

        @param config:
            The configuration file. It is used to retrieve the 'dbfilename'
            option of the miner section.
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner, which is also the name of its
            configuration section.
        @type minerName: str
        @param callback:
            Optional callable used to report progress.
        @type callback: fun(float, ...)
        """
        self.config = config
        self.name = minerName
        self.callback = callback
        self.dbFile = self.config.getas(self.name, 'dbfilename')

    def rm_db(self):
        """Remove the database file.

        The file is removed with os.remove() rather than by running "rm" in a
        shell, so paths containing spaces or shell metacharacters are handled
        correctly. As before, a missing file is not an error.
        """
        if not self.dbFile:
            return
        try:
            os.remove(self.dbFile)
        except FileNotFoundError:
            pass
        except OSError as err:
            # "rm" only printed a message in this case; keep it non fatal.
            lg.error('Unable to remove ' + str(self.dbFile) + ': ' + str(err))

    @abc.abstractmethod
    def mine(self):
        """Perform the mining operation (must be implemented by subclasses).

        @raise NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError("Method must be implemented")
384
class TextMiner(Miner):
    """The miner for text files.

    This miner mines text files by extracting valid n-grams from them and
    inserting them in databases. Mining a text require:
        - Tokenizing the text.
        - Extracting n-grams.
        - Inserting n-grams in a database in a special way.

    @see: L{tknz.TextTokenizer}, L{db.insert_ngrams}

    G{classtree TextMiner}
    """

    def __init__(self, config, minerName, callback=None):
        """Constructor of the TextMiner class.

        @param config:
            The configuration file. It is used to retrieve the miner parameters.
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            The callback is used to show the progress percentage. In the gui a
            callback method is implemented to update a progress bar showing the
            n-grams insertion progress (cf. gui.py).
        @type callback: fun(float, ...)
        """
        super().__init__(config, minerName, callback)
        self.lowercase = self.config.getas(self.name, 'lowercase')
        self.n = self.config.getas(self.name, 'n', 'int')

    def _notify(self, *args):
        """Forward a progress report to the callback, if there is one.

        The callback is optional (it defaults to None) but used to be called
        unconditionally, which raised a TypeError for miners created without
        one.
        """
        if self.callback:
            self.callback(*args)

    def _mine_file(self, textPath, append):
        """Insert the 1-grams to self.n-grams of a text file in the database."""
        for size in range(1, self.n + 1):
            self.add_to_db(self.crt_ngram_map(textPath, size), size, append)

    def update_db(self, textPath):
        """Mine a text file, updating the database.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        """
        self._mine_file(textPath, True)

    def crt_new_db(self, textPath):
        """Mine a text file.

        This method doesn't try to update the n-grams counts so it will fail if
        it tries to add an n-gram which is already in the database but this
        method is a little faster than update_db().

        @note: If you're intending to create a new database but it already
            exists please consider calling rm_db() first.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        """
        self._mine_file(textPath, False)

    def crt_ngram_map(self, textPath, n):
        """Create a n-gram dictionary from a file.

        The tokenizer created for the file is kept in self.tokenizer.

        @param textPath:
            The path to the text file to mine.
        @type textPath: str
        @param n:
            The n in n-gram, forwarded to L{tknz.TextTokenizer}.
        @type n: int

        @return:
            The n-gram dictionary.
        @rtype: dict
        """
        lg.info("Parsing " + str(n) + "-grams from " + textPath)
        self._notify(0, 'parsing ' + str(n) + '-grams from ' + textPath)
        # NOTE(review): the callback is handed over as-is and may be None;
        # presumably the tokenizer copes with that -- confirm in tknz.
        self.tokenizer = tknz.TextTokenizer(
            textPath, n, self.lowercase, 0, self.callback)
        ngramMap = self.tokenizer.tknize_text()
        lg.info(
            str(len(ngramMap)) + ' ngrams have been extracted from ' + textPath)
        return ngramMap

    def add_to_db(self, ngramMap, n, append=False):
        """Add n-grams of an n-gram dictionary to the database.

        @param ngramMap:
            The n-gram dictionary returned by tknz.TextTokenizer.tknize_text().
            See the above-mentioned method docstring for more information.
        @type ngramMap: dict
        @param n:
            The n in n-gram, forwarded to L{db.insert_ngrams}.
        @type n: int
        @param append:
            Indicate whether the n-grams should be appended to the database.
        @type append: bool
        """
        lg.info("Writing result to " + self.dbFile)
        self._notify(0, 'writing ' + str(n) + '-grams to ' + self.dbFile)
        # NOTE(review): as in crt_ngram_map(), the callback may be None here;
        # confirm that db.insert_ngrams accepts it.
        db.insert_ngrams(ngramMap, n, self.dbFile, append, True, self.callback)
        lg.info('n-grams successfully added to the database')
489
class CorpusMiner(TextMiner):
    """The miner for text corpus.

    This miner is basically a L{minr.TextMiner} wrapper that implements the
    mine() method which merely loops on every file of the corpus and calls the
    L{minr.TextMiner.update_db} method to effectively do the mining operation.

    G{classtree CorpusMiner}
    """

    def __init__(self, config, minerName, callback=None):
        """Constructor of the CorpusMiner class.

        @param config:
            The configuration file. It is used to retrieve the miner parameters.
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            The callback is used to show the progress percentage. In the gui a
            callback method is implemented to update a progress bar showing the
            n-grams insertion progress (cf. gui.py).
        @type callback: fun(float, ...)
        """
        # The parent constructors already store the callback.
        super().__init__(config, minerName, callback)
        self.corpusFiles = self.config.getas(self.name, 'texts', 'list')

    def mine(self):
        """Perform the mining operation.

        Every file listed in the 'texts' option is mined with update_db(),
        then completion is reported to the callback when there is one.
        """
        for text in self.corpusFiles:
            self.update_db(text)
        # The callback is optional: the registry creates miners without one.
        if self.callback:
            self.callback(100, 'Done')
525
class DictMiner(Miner):
    """A miner to mine dictionary-like files.

    This miner isn't a real miner as it only extracts words from a
    dictionary-like file and inserts them into a database.
    A dictionary-like file is a file listing words, one word per line::
        about
        army
        bath
        boat
        ...

    G{classtree DictMiner}
    """

    def __init__(self, config, minerName, callback=None):
        """Constructor of the DictMiner class.

        @param config:
            The configuration file. It is used to retrieve the miner parameters.
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            The callback is used to show the progress percentage. In the gui a
            callback method is implemented to update a progress bar showing the
            n-grams insertion progress (cf. gui.py).
        @type callback: fun(float, ...)
        """
        super().__init__(config, minerName, callback)
        # Was read through the misspelled attribute "self.nale", which made
        # the constructor fail with an AttributeError.
        self.dictFile = self.config.getas(self.name, 'dictionary')
        self.database = self.config.getas(self.name, 'dbfilename')

    def mine(self):
        """Perform the mining operation.

        @note: This method could have used the update_db() method like the
            CorpusMiner and FbMiner do but inserting the words directly
            avoids useless operations and is, therefore, faster.

        @todo 0.0.2:
            Make sure every lines of the file contain one single word (or none).
        """
        self.insert_words()

    def insert_words(self):
        """Insert every word of the dictionary file in the database as 1-grams.

        Each line is stripped of its newline and lowercased. A word which is
        already in the database gets its count incremented, a new word is
        inserted with a count of 1. The progress percentage is reported to the
        callback (when there is one) after each line. The database is closed
        even if the insertion fails.
        """
        sql = db.SqliteDatabaseConnector(self.database)
        try:
            sql.crt_ngram_table()
            # First pass: count the lines to scale the progress. An empty
            # file gives 0 and the loop below is then simply not entered.
            with open(self.dictFile) as dictFile:
                noLines = sum(1 for _ in dictFile)
            progress = 0
            with open(self.dictFile) as dictFile:
                for word in dictFile:
                    ngram = [word.strip('\n').lower()]
                    oldCount = sql.ngram_count(ngram)
                    if oldCount > 0:
                        sql.update_ngram(ngram, oldCount + 1)
                    else:
                        sql.insert_ngram(ngram, 1)
                    progress += 100 / noLines
                    if self.callback:
                        self.callback(progress)
            sql.commit()
            sql.crt_index(1)
        finally:
            sql.close_database()
615
class FacebookMiner(TextMiner):
    """The Facebook user profile miner.

    This miner uses an access token to access a user facebook wall and
    retrieve its text messages. While the messages are retrieved, the miner
    writes them into a txt file. Once every message has been written to the
    file the miner generates n-grams from the file and inserts them in the
    database, using the L{minr.TextMiner} methods.

    @note: The miner does not retrieve every facebook wall message each time.
        When mining a facebook wall it saves the published date of the latest
        message and on next mining it will only retrieve the messages that
        have been published AFTER the saved date.
        See: L{minr.FacebookMiner.update_fb}

    @todo 0.2.0:
        Create web app in order to log the user to facebook and twitter,
        authenticate them and ask for permissions (if needed) and finally get
        access token.

    G{classtree FacebookMiner}
    """

    #: strptime() format of the 'created_time' field of a post.
    _TIME_FORMAT = "%Y-%m-%dT%H:%M:%S+0000"

    def __init__(self, config, minerName, callback=None):
        """Constructor of the FacebookMiner class.

        @param config:
            The configuration file. It is used to retrieve the miner parameters.
        @type config: L{drvr.Configuration}
        @param minerName:
            The name of the miner.
        @type minerName: str
        @param callback:
            The callback is used to show the progress percentage. In the gui a
            callback method is implemented to update a progress bar showing the
            n-grams insertion progress (cf. gui.py).
        @type callback: fun(float, ...)
        """
        # The parent constructors already store the callback.
        super().__init__(config, minerName, callback)
        self.fbFile = 'fb.txt'
        self.accessToken = self.config.getas(self.name, 'accesstoken')
        self.previousLast = self.config.getas(self.name, 'last_update', 'int')

    def _notify(self, *args):
        """Forward a progress report to the callback, if there is one."""
        if self.callback:
            self.callback(*args)

    def _post_time(self, post):
        """Return the 'created_time' of a post as a unix time (float).

        @note: time.mktime() reads the value as local time although the
            format carries a +0000 offset. This is kept as it is because the
            'last_update' values already saved were computed the same way.

        @raise KeyError: If the post has no 'created_time' key.
        """
        created = datetime.datetime.strptime(
            post['created_time'], self._TIME_FORMAT)
        return time.mktime(created.timetuple())

    def mine(self):
        """Perform the mining operation.

        Check the access token, fetch the user posts written since the last
        mining operation into self.fbFile and mine this file like any other
        text. Nothing is mined if the token is missing or rejected.
        """
        if self.accessToken is None:
            self._notify(0, 'error: Facebook access token is missing')
            lg.error('Facebook access token is missing')
            return
        try:
            # get_user_details() returns the response text wrapped in a JSON
            # string, hence the two decoding steps. On failure it returns a
            # dictionary, which json.loads() rejects with a TypeError.
            userInfo = json.loads(json.loads(self.get_user_details()))
        except (TypeError, ValueError):
            self._notify(0, 'error: Facebook access token is invalid')
            lg.error('Facebook access token is invalid')
            return
        self._notify(0, 'Mining facebook user posts')
        graph = GraphAPI(self.accessToken)
        profile = graph.get_object(userInfo['id'])
        posts = graph.get_connections(profile['id'], 'posts')
        self.update_fb(posts)
        self.update_db(self.fbFile)
        self._notify(100, 'Done')

    def write_to_file(self, post, fo):
        """Write a facebook post message to a file.

        @param post:
            A facebook post is a dictionary. If the post contains a textual
            message then it is associated to the 'message' key. Posts
            without this key are ignored.
        @type post: dict
        @param fo:
            The file to write in. It must be opened in binary mode as the
            message is written as UTF-8 encoded bytes.
        @type fo: binary file object
        """
        if "message" in post:
            fo.write(post['message'].encode('utf-8') + b"\n")

    def _write_posts(self, posts, since=None):
        """Write the posts messages in self.fbFile, following the paging links.

        @param posts:
            The first page of posts.
        @type posts: dict
        @param since:
            Unix time of the last mined post. The scan stops at the first
            post which is not newer. None means "no limit".
        @type since: number or None

        @return:
            The unix time of the first post of the first non-empty page, or
            None if no post was found.
        @rtype: float or None
        """
        newest = None
        with open(self.fbFile, "wb") as fo:
            while True:
                try:
                    page = posts['data']
                    # An empty page used to raise an uncaught IndexError.
                    if page:
                        pageTime = self._post_time(page[0])
                        if newest is None:
                            newest = pageTime
                        elif since is not None:
                            # not very regular, particularly on first mining
                            self._notify(float(100 * since / pageTime))
                    reachedOld = False
                    for post in page:
                        if since is not None and \
                                self._post_time(post) <= since:
                            reachedOld = True
                            break
                        self.write_to_file(post, fo)
                    if reachedOld:
                        break
                    posts = requests.get(posts['paging']['next']).json()
                except KeyError:
                    # No 'data' or no next page: the end has been reached.
                    break
        return newest

    def _save_last_update(self, newest):
        """Record the time of the newest post in the miner and its config."""
        if newest:
            self.previousLast = int(float(newest))
            self.config[self.name]['last_update'] = str(self.previousLast)

    def scrap_fb(self, posts):
        """Fully mine every posts of a facebook profile.

        Mine the posts contained in the "posts" parameter (which should contain
        the latest posts) and keep requesting older posts until we reach the
        last post. If a post contains a textual message it is automatically
        written in the file at self.fbFile.
        The latest post unix time is computed and written in the 'last_update'
        option of the config.

        @note: It is not possible to get every post of a facebook profile in a
            single request (except if the profile contains very few posts). So
            the method must scan the "posts" dictionary and request the older
            posts until the response holds no link to a next page.

        @param posts:
            "posts" is returned by GraphAPI.get_connections(), it contains
            posts of a facebook profile.
        @type posts: dict
        """
        self._save_last_update(self._write_posts(posts))

    def update_fb(self, posts):
        """Mine posts of a facebook profile since the last mining operation.

        Mine the posts contained in the "posts" parameter (which should contain
        the latest posts) and keep requesting older posts until:
            - We reach a post that has already been mined (comparison is
              carried out using unix time and the 'last_update' config option)
            - We reach the last post of the facebook profile.
        If a post contains a textual message it is automatically written in
        the file at self.fbFile.
        The latest post unix time is computed and written in the config so that
        we know which posts have been published after this one the next time
        the method is called.

        @note: It is not possible to get every post of a facebook profile in a
            single request (except if the profile contains very few posts).
            So the method must scan the "posts" dictionary and request the
            older posts until the response holds no link to a next page.

        @param posts:
            "posts" is returned by GraphAPI.get_connections(), it contains posts
            of a facebook profile.
        @type posts: dict
        """
        since = self.previousLast if self.previousLast else FB_CREATION_TIME
        self._save_last_update(self._write_posts(posts, since))

    def get_user_details(self):
        """Use the facebook access token to get details about the user.

        @return:
            The text of the response wrapped in a JSON string (it has to be
            decoded twice with json.loads() to get the user details), or an
            empty dictionary if the request fails, which probably means that
            the access token is invalid or outdated.
        @rtype: str or dict
        """
        url = "https://graph.facebook.com/me?access_token=" + self.accessToken
        try:
            with urllib2.urlopen(urllib2.Request(url)) as response:
                return json.dumps(response.read().decode('utf-8'))
        except Exception:
            # Best effort: the caller treats the empty dictionary as an
            # invalid token. The url is not logged, it holds the token.
            lg.info('Facebook user details request failed')
            return {}

    def rm_db(self):
        """Override the parent method.

        This method deletes the database file and also sets the last_update
        option of the facebook miner to the oldest value possible so that the
        facebook account will be fully scraped on next mining operation.
        """
        if self.dbFile:
            try:
                os.remove(self.dbFile)
            except FileNotFoundError:
                pass
            except OSError as err:
                # "rm" only printed a message in this case; keep it non fatal.
                lg.error(
                    'Unable to remove ' + str(self.dbFile) + ': ' + str(err))
        # Use the section of this miner instead of a hard-coded 'FbMiner'.
        self.previousLast = FB_CREATION_TIME
        self.config[self.name]['last_update'] = str(FB_CREATION_TIME)
828