from news_util import walk_news
import sys, os

from mrjob.protocol import JSONValueProtocol


def encode_document(text, cats=None, id=None):
    """Encode a document as a line of JSON so that MRTextClassifier can read it.

    Args:
    text -- the text of the document, as either a unicode or a byte string.
        Byte strings are decoded with the interpreter's default encoding and
        any bytes that cannot be decoded are silently dropped.
    cats -- a dictionary mapping a category name (e.g. 'sports') to True if
        the document is in the category, and False if it's not. None indicates
        that we have no information about this document's categories.
    id -- a unique ID for the document (any kind of JSON-able value should
        work). It is stored unchanged under 'docid'; this function does not
        generate one, so None is passed through when it is omitted.

    Returns the record as encoded by JSONValueProtocol, followed by a
    newline.
    """
    # unicode() raises TypeError when asked to decode something that is
    # already unicode, so only decode input that still needs it.
    if not isinstance(text, unicode):
        text = unicode(text, errors='ignore')

    # Normalize category names to unicode and membership flags to real bools.
    memberships = dict((unicode(name), bool(flag))
                       for name, flag in (cats or {}).iteritems())

    record = {'document': text, 'cats': memberships, 'docid': id,
              'type': 'document'}
    # The protocol is called with a None key; only the value is of interest.
    return JSONValueProtocol.write(None, record) + '\n'



# Command-line arguments: <input root> <output root>, both normalized to
# absolute paths.
root, outroot = map(os.path.abspath, (sys.argv[1], sys.argv[2]))

def encode(category, fname, root):
    """Encode one source file as a JSON document in the output tree.

    Reads the file at root/fname and writes the record produced by
    encode_document() to outroot/category/fname, creating the category
    directory first if it does not exist yet. The document is marked as
    belonging to `category` and uses `fname` as its ID.

    Args:
    category -- name of the category; also the output subdirectory name
    fname -- file name, relative to both `root` and the category output dir
    root -- directory containing the input file

    Raises OSError if the category directory is missing and cannot be
    created.
    """
    out_dir = os.path.join(outroot, category)
    try:
        os.mkdir(out_dir)
    except OSError:
        # An already-existing directory is expected after the first file of a
        # category; any other failure (permissions, missing parent, a plain
        # file in the way) is a real error and must not be hidden.
        if not os.path.isdir(out_dir):
            raise

    with open(os.path.join(root, fname), 'r') as src:
        with open(os.path.join(out_dir, fname), 'w') as dst:
            dst.write(encode_document(src.read(), {category: 1}, fname))


# Start the conversion: pass the input root and the encode callback to
# walk_news. Presumably walk_news calls encode(category, fname, root) once
# per article file it finds -- confirm against news_util.walk_news.
walk_news(root, encode)
