events_analysis.py

#!/usr/bin/env python

##########################################################################
#                                                                        #
#  This program is free software; you can redistribute it and/or modify  #
#  it under the terms of the GNU General Public License as published by  #
#  the Free Software Foundation; version 2 of the License.               #
#                                                                        #
#  This program is distributed in the hope that it will be useful,       #
#  but WITHOUT ANY WARRANTY; without even the implied warranty of        #
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         #
#  GNU General Public License for more details.                          #
#                                                                        #
##########################################################################

from __future__ import division
from datetime import date, timedelta
from sonet.mediawiki import is_archive, HistoryPageProcessor
from sqlalchemy import select, func
from base64 import b64decode
from zlib import decompress
from wbin import deserialize
import csv
from os import path

from django.utils.encoding import smart_str

from sonet.models import get_events_table
from sonet import lib

## GLOBAL VARS
initial_date = date(2000, 1, 1)


def page_iter(lang='en', paginate=10000000, desired=None):
    events, conn = get_events_table()

    count_query = select([func.count(events.c.id)],
               events.c.lang == lang)
    s = select([events.c.title, events.c.data, events.c.talk,
                events.c.total_editors, events.c.bot_editors,
                events.c.anonymous_editors],
                events.c.lang == lang).order_by(
        events.c.title, events.c.talk).limit(paginate)

    ## searching only desired pages
    if desired:
        s = s.where(events.c.title.in_(desired))
        count_query = count_query.where(events.c.title.in_(desired))

    count = conn.execute(count_query).fetchall()[0][0]

    print 'PAGES:', count

    for offset in xrange(0, count, paginate):
        rs = conn.execute(s.offset(offset))
        for row in rs:
            yield (row[0],
                   deserialize(decompress(b64decode(row[1]))),
                   row[2], row[3], row[4], row[5])


def get_days_since(start_date, end_date, anniversary_date, td_list):
    """
    Returns the number of days passed between two dates. If the considered date
    is an anniversary, count the number of days in the range around the
    anniversary for each year

    >>> td = [timedelta(i) for i in range(-10,11)]
    >>> get_days_since(date(2001, 9, 11), date(2005, 9, 19), None, td)
    1470
    >>> get_days_since(date(2010, 9, 11), date(2005, 9, 19), None, td)
    0
    >>> get_days_since(date(2005, 9, 16), date(2005, 9, 19), None, td)
    4
    >>> get_days_since(date(2001,9,11),date(2010,7,29),date(2001,9,11),td)
    179
    >>> get_days_since(date(2001,9,22),date(2010,7,29),date(2001,9,11),td)
    168
    >>> get_days_since(date(2006,1,7),date(2006,7,7),date(2005,7,7),td)
    11
    >>> get_days_since(date(2010,2,4),date(2010,7,29),date(1952,8,4),td)
    5
    >>> td = [timedelta(i) for i in range(-50,51)]
    >>> get_days_since(date(2001,12,30),date(2002,1,1),date(2001,12,30),td)
    3
    >>> td = [timedelta(i) for i in range(-20,21)]
    >>> get_days_since(date(2001,1,1),date(2001,12,31),date(2001,6,15),td)
    41
    >>> td = [timedelta(i) for i in range(-5,6)]
    >>> get_days_since(date(2001,1,1),date(2003,1,1),date(2001,6,15),td)
    22
    >>> get_days_since(date(2001,9,22),date(2010,7,29),date(2001,9,11),None)
    8
    >>> get_days_since(date(2004,2,29),date(2010,7,29),date(2000,2,29),None)
    7
    """
    if start_date > end_date:
        return 0
    if not anniversary_date:
        return (end_date - start_date).days + 1

    counter = 0

    for year in range(start_date.year, end_date.year + 1):
        try:
            ad = date(year, anniversary_date.month, anniversary_date.day)
        except ValueError:
            ad = date(year, anniversary_date.month, anniversary_date.day - 1)

        ## TODO: introduce dateutil.rrule.between ?
        if td_list:
            counter += len([1 for d in (ad + td
                        for td in td_list)
                        if (d >= start_date and d <= end_date)])
        else:
            counter += int(ad >= start_date and ad <= end_date)

    return counter


def is_near_anniversary(creation, revision, range_):
    """
    >>> is_near_anniversary(date(2001, 9, 11), date(2005, 9, 19), 10)
    True
    >>> is_near_anniversary(date(2001, 1, 1), date(2005, 12, 30), 10)
    True
    >>> is_near_anniversary(date(2001, 12, 31), date(2005, 1, 1), 10)
    True
    >>> is_near_anniversary(date(2001, 12, 31), date(2005, 1, 14), 15)
    True
    >>> is_near_anniversary(date(2004, 2, 29), date(2005, 3, 7), 10)
    True
    >>> is_near_anniversary(date(2001, 12, 25), date(2005, 1, 1), 5)
    False
    >>> is_near_anniversary(date(2001, 12, 25), date(2001, 12, 25), 0)
    True
    >>> is_near_anniversary(date(2001, 12, 25), date(2001, 12, 24), 0)
    False
    """
    #year = creation.day if isleap(revision.year) else creation.day

    try:
        anniversary = date(revision.year, creation.month, creation.day)
    except ValueError:
        # print e, creation, revision
        anniversary = date(revision.year, creation.month, (creation.day - 1))
    delta = (revision - anniversary).days
    if abs(delta) <= range_:
        return True
    elif delta > 0:
        try:
            anniversary = date(revision.year + 1, creation.month,
                               creation.day)
        except ValueError:
            anniversary = date(revision.year + 1, creation.month,
                               (creation.day - 1))
        delta = (revision - anniversary).days
        return abs(delta) <= range_
    else:
        try:
            anniversary = date(revision.year - 1, creation.month,
                                   creation.day)
        except ValueError:
            anniversary = date(revision.year - 1, creation.month,
                               (creation.day - 1))
        delta = (revision - anniversary).days
        return abs(delta) <= range_


def get_first_revision(start_date, data):
    """
    >>> get_first_revision(date(2000,1,1), 2)
    >>> get_first_revision(date(2000,1,1),
    ...                    {51: 'a', 20: 'b', 10: 'c', 123: 'd'})
    datetime.date(2000, 1, 11)
    """
    try:
        return start_date + timedelta(min(data))
    except TypeError:
        return


def print_data_file(fn, dict_, s_date, e_date):
    """
    Given a filename and a dictionary of day => revisions
    it creates a csv file
    """
    s_days = (s_date - initial_date).days
    e_days = (e_date - initial_date).days

    with open(fn, 'w') as f:
        wrt = csv.writer(f)
        wrt.writerow(['date', 'total_edits', 'bot_edits', 'anon_edits'])
        for d in range(s_days, e_days + 1):
            try:
                t = dict_[d]
                wrt.writerow(
                    [(initial_date + timedelta(d)).strftime('%Y-%m-%d'),
                     t[0], t[1], t[2]]
                )
            except KeyError:
                wrt.writerow(
                    [(initial_date + timedelta(d)).strftime('%Y-%m-%d'),
                     0, 0, 0]
                )


class EventsProcessor(HistoryPageProcessor):
    count_desired = []
    count_pages = 0
    count_revisions = 0
    creation_accumulator = {}
    csv_writer = None
    desired_only = False  # search desired pages only
    desired_pages = {}
    dump_date = None
    groups = None
    lang = None
    keys_ = ['article', 'type_of_page', 'desired', 'total_edits',
             'unique_editors', 'anniversary_edits', 'n_of_anniversaries',
             'anniversary_days', 'anniversary_edits/total_edits',
             'non_anniversary_edits/total_edits', 'event_date',
             'first_edit_date', 'first_edit_date-event_date_in_days']
    output_dir = None
    pages = []
    range_ = None
    skipped_days = None
    td_list = None
    encoding = "latin-1"
    __event_date = None
    __first_edit_date = None
    __data = None
    __desired = None
    __id = None
    __n_of_anniversaries = None
    __title = None
    __type = None
    __unique_editors = 0

    def __init__(self, **kwargs):

        from subprocess import Popen, PIPE

        self.lang = kwargs['lang']
        self.range_ = kwargs['range_']
        self.skipped_days = kwargs['skip']
        self.dump_date = kwargs['dump_date']
        self.desired_only = kwargs['desired']
        self.groups = kwargs['groups']

        ## list of time delta, used in get_days_since
        ## used together with anniversary day in order to find
        ## days in anniversary's range
        self.td_list = [timedelta(i) for i in
                        range(-self.range_, self.range_ + 1)]

        if not lib.find_executable('7z'):
            raise Exception('Cannot find 7zip executable (7z)')

        fn = kwargs['output_file'] + '.bz2'

        if path.isfile(fn):
            raise Exception('Delete file ' + fn + ' before proceeding')

        zip_process = Popen(['7z', 'a', '-tbzip2', '-mx=9', fn, '-si'],
                            stdin=PIPE, stderr=None)

        self.csv_writer = csv.DictWriter(zip_process.stdin,
                                   fieldnames=self.keys_, delimiter=',',
                                   quotechar='"', quoting=csv.QUOTE_ALL)

        self.csv_writer.writeheader()

        ## Outup directory for data files (only for desired pages)
        self.output_dir = fn[0:fn.rfind('/')] + '/%s_data_files/' % self.lang
        ## check if the directory exists. if not create it
        lib.ensure_dir(self.output_dir)

    def set_desired(self, fn):
        ## save desired pages list
        for r in csv.reader(open(fn, 'rb')):
            page = r[0].decode(self.encoding).replace('_', ' ')
            if page[0] == '#':
                continue

            try:
                self.desired_pages[page] = \
                    date(int(r[1][:4]), int(r[1][5:7]), int(r[1][8:10]))
            except:
                self.desired_pages[page] = None

    def get_start_date(self):
        '''
        Returns the date to be considered as start date for the analyzed page
        considering skipped days
        '''
        sd = timedelta(self.skipped_days)

        if self.__first_edit_date == self.__event_date:
            s_date = self.__event_date + sd

        elif self.__event_date + sd > self.__first_edit_date:
            s_date = self.__event_date + sd

        else:
            s_date = self.__first_edit_date

        return s_date

    def get_days_since(self):
        '''
        Returns the number of days between the start date and the dump date
        in a range around the anniversary day, plus the number of anniversaries
        '''
        s_date = self.get_start_date()
        return get_days_since(start_date=s_date, end_date=self.dump_date,
                                  anniversary_date=self.__event_date,
                                  td_list=self.td_list)

    def get_n_anniversaries(self):
        '''
        returns number of anniversaries from the initial date
        up to the dump date
        '''
        s_date = self.get_start_date()
        return get_days_since(start_date=s_date, end_date=self.dump_date,
                                  anniversary_date=self.__event_date,
                                  td_list=None)

    def process(self, threshold=1.):
        from random import random

        des = self.desired_pages.keys() if self.desired_only else None

        for title, data, talk, te, be, ae in \
            page_iter(lang=self.lang, desired=des):
            ## check whether the page is an archive or not
            ## if it is a link, skip it!
            if is_archive(title):
                continue

            ## editors who are neither bots nor anonymous
            oe = te - be - ae

            ## page's attributes
            self.__title = title
            self.__data = data
            self.__desired = self.is_desired(self.__title)
            self.__type_of_page = talk  # 0 = article, 1 = talk
            ## unique editors
            ## skip editors belonging to not-to-be-analyzed groups
            self.__unique_editors = (
                (oe if 'total' not in self.groups else 0) +
                (be if 'bots' not in self.groups else 0) +
                (ae if 'anonymous' not in self.groups else 0)
            )

            if self.__desired and self.__title not in self.count_desired:
                print "PROCESSING DESIRED PAGE:", self.__title
                self.count_desired.append(self.__title)

            ## to skip or not to skip? This is the question...
            if not self.__desired and threshold < 1.:
                if threshold == 0. or random() > threshold:
                    self.__skip = True
                else:
                    self.__skip = False
            else:
                self.__skip = False

            ## process page
            if not self.__skip:
                self.process_page()

        self.flush()

    def process_page(self, _=None):
        """
        process a page counting all the revisions made and
        calculating some statistics as number of days since
        creation, edits in anniversary's range, etc.
        """

        ## page's (and last page as well) attributes
        title = self.__title
        talk = self.__type_of_page
        groups = self.groups

        ## creation date
        self.__first_edit_date = get_first_revision(initial_date,
                                                 self.__data)
        if self.__desired:
            if self.desired_pages[title] is not None:
                self.__event_date = self.desired_pages[title]
            else:
                self.__event_date = self.__first_edit_date
        else:
            self.__event_date = self.__first_edit_date

        ## if it is a desired page then print out data
        ## about its daily revisions
        if self.__desired:
            fn = self.output_dir + '%s%s.csv' % ('Talk:' if talk else '',
                                                 title,)
            print_data_file(fn, self.__data, self.__first_edit_date,
                            self.dump_date)

        ## if the page has been created less than one year ago, skip
        ## TODO: 365 - range??
        if (self.dump_date - self.__first_edit_date).days < 365:
            return

        anniversary = 0
        total = 0
        in_skipped = 0

        for d, t in self.__data.iteritems():
            tot_edits, bot_edits, anon_edits = t
            other_edits = tot_edits - bot_edits - anon_edits
            revision = initial_date + timedelta(d)
            if (revision - self.__event_date).days < self.skipped_days:
                in_skipped += tot_edits
                continue
            if is_near_anniversary(self.__event_date, revision, self.range_):
                ## edits made in anniversary's range
                ## skip edits made by not-to-be-analyzed groups
                anniversary += (
                    (other_edits if 'total' not in groups else 0) +
                    (bot_edits if 'bots' not in groups else 0) +
                    (anon_edits if 'anonymous' not in groups else 0)
                )
            ## total edits
            ## skip edits made by not-to-be-analyzed groups
            total += (
                (other_edits if 'total' not in groups else 0) +
                (bot_edits if 'bots' not in groups else 0) +
                (anon_edits if 'anonymous' not in groups else 0)
            )

        try:
            ann_total_edits = anniversary / total
            not_ann_total_edits = (total - anniversary) / total
        except ZeroDivisionError:
            ann_total_edits = 0.
            not_ann_total_edits = 0.

        dict_ = {
            'article': smart_str(self.__title),
            'type_of_page': int(not talk),
            'desired': int(self.__desired),
            'total_edits': total,
            'unique_editors': self.__unique_editors,
            'anniversary_edits': anniversary,
            'n_of_anniversaries': self.get_n_anniversaries(),
            'anniversary_days': self.get_days_since(),
            'anniversary_edits/total_edits': ann_total_edits,
            'non_anniversary_edits/total_edits': not_ann_total_edits,
            'event_date': self.__event_date,
            'first_edit_date': self.__first_edit_date,
            'first_edit_date-event_date_in_days': (self.__first_edit_date -
                                                   self.__event_date).days
        }

        self.pages.append(dict_)
        self.count_pages += 1
        self.count_revisions += total

        if not self.count_pages % 50000:
            self.flush()

    def flush(self):
        '''
        This functions emptys the pages' queue and write it on the
        bzipped csv file
        '''
        print 'PAGES:', self.count_pages, 'REVS:', self.count_revisions, \
            'DESIRED:', len(self.count_desired)

        self.csv_writer.writerows(self.pages)

        self.pages = []
        return


def create_option_parser():
    from optparse import OptionParser
    from sonet.lib import SonetOption

    op = OptionParser('%prog [options] file dump-date output-file',
                      option_class=SonetOption)

    op.add_option('-l', '--lang', action="store", dest="lang",
                 help="Wikipedia language (en, it, vec, ...)", default="en")
    op.add_option('-r', '--range', action="store", dest="range_",
                 help="number of days before and after anniversary date",
                 default=10, type="int")
    op.add_option('-s', '--skipped-days', action="store", dest="skip",
                 help="number of days to be skipped", default=180, type="int")
    op.add_option('-d', '--desired-only', action="store_true", dest='desired',
                 default=False, help='analysis only of desired pages')
    op.add_option('-g', '--groups', action="store", dest='groups', default='',
                 help='comma separated list of not-to-be-analyzed groups \
                 (total|bots|anonymous)')
    op.add_option('-R', '--ratio', action="store", dest="ratio",
                 help="percentage of pages to be analyzed",
                 default=1., type="float")
    op.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    return op


def main():

    p = create_option_parser()
    opts, files = p.parse_args()

    try:
        desired_pages_fn, dump_date, out_file = files
    except ValueError:
        p.error("Bad number of arguments! Try with --help option")

    ## creating dump date object
    dump = lib.yyyymmdd_to_datetime(dump_date).date()

    ## list of not-to-be-analyzed groups
    groups = [g for g in opts.groups.split(',') if g]

    ## creating processor
    processor = EventsProcessor(lang=opts.lang, range_=opts.range_,
                                skip=opts.skip, dump_date=dump, groups=groups,
                                desired=opts.desired, output_file=out_file)
    processor.encoding = opts.encoding
    ## set desired pages
    processor.set_desired(desired_pages_fn)
    ## main process
    processor.process(threshold=opts.ratio)

if __name__ == "__main__":
    main()