#!/usr/bin/python
"""
Reads the output of gnokii --getsms and stores it in an sqlite3 db.

NOTE: for now, only INBOX-will be parsed correctly!

USAGE SAMPLE:
	gnokii --getsms IN 1 end | ./storesms.py
	
OPTIONS:
	-h, --help:              shows help message
	-i, --insert-incomplete  also insert linked SMS that were missing a part.
	

(c) 2009 Thomas Unterthiner

Feel free to redistribute and/or modify this under the terms of the 
2nd Version of the GNU General Public License."""

import getopt
import re
import sys
import sqlite3 as dbapi
import datetime


def setup_db():
	""" Sets up the database and creates the tables if neccessary."""
	db = dbapi.connect("sms.sqlite")
	
	# Stores the single 160-char long SMS that gnokii spits out.
	# They could be linked togother to longer, linked SMS, hence
	# we store them here for further processing.
	# fields should be clear. Processed tells us if we've already 
	# processed this entry and made it into an entry in inbox.
	db.execute("""
		CREATE TABLE IF NOT EXISTS rawdata(
			id INTEGER PRIMARY KEY,
			status TEXT,
			time DATE,
			sender NUMBER,
			text TEXT,
			msgcenter TEXT,
			part INTEGER,
			total INTEGER,
			processed BOOLEAN DEFAULT 0)""")

	db.execute("""
		CREATE TABLE IF NOT EXISTS inbox(
			id INTEGER PRIMARY KEY,
			sender NUMBER,
			text TEXT,
			time DATE)""")
	db.commit()
	return db
	
def store_entry_if_new(db, entry):
	""" Stores an entry if it's not already in the db."""
	
	stmt1 = """SELECT 1 FROM rawdata 
		WHERE status = :status AND time = :time AND sender = :sender
			AND text = :text AND msgcenter = :msgcenter AND part = :part
			AND total = :total"""
	res = db.execute(stmt1, entry)
	if not res or not res.fetchone():
		stmt2 = """
			INSERT INTO rawdata(status, time, sender, text, msgcenter, part, total)
			VALUES(:status, :time, :sender, :text, :msgcenter, :part, :total)"""
		db.execute(stmt2, entry)


def parse_input(db, inputfile):
	""" Reads gnokii-output from a file and stores it in the rawdata-table."""

	regex = [
		re.compile(r"^(?P<nr>\d+)\. Inbox Message \((?P<status>.*)\)$"),
		re.compile(r"^Date/time: (?P<time>\d+/\d+/\d+\s+\d{2}:\d{2}:\d{2}\s+(\+|-)\d+)$"),
		re.compile(r"Sender:\s*(?P<sender>\+?\d+)\s+Msg Center:(?P<msgcenter>.*)$"),
		re.compile(r"^(?P<type>Linked|Text):$"),
		re.compile(r"^Linked\s+\((?P<part>\d+)/(?P<total>\d+)\):$"),
		re.compile(r"^(?P<text>.+)$")]

	i = 0
	entry = dict({'part': 0, 'total': 1})
	for line in inputfile:
		line = line.decode('utf8')
		matches = [re.match(r, line) for r in regex]

		if matches[0] and 'text' in entry:
			store_entry_if_new(db, entry)
			entry = dict({'part': 0, 'total': 1})

		# update fields (text must be handled on its on, because there can be more text-lines)
		for m in matches[:-1]:
			if m: entry.update(m.groupdict())
		if len([1 for m in matches if m]) == 1: # we only matched text (so the entry must be text):
			if 'text' in entry:
				entry['text'] += "\n" + matches[-1].group('text')
			else:
				entry['text'] = matches[-1].group('text')


def get_partial_messages(db):
	""" Returns a list of all the entries that were linked (i.e. have more than one part).
	Each entry will be a dict. The send-time will be parsed into a datetime.datetime.
	The 'group'-element will be set to 0 for all entries. This variable will be used by
	other functions to link the partial messages back together.	
	"""

	db.row_factory = dbapi.Row
	result = db.execute("SELECT * FROM rawdata WHERE total > 1 AND processed = 0")
	data = result.fetchall()

	# datetime.datetime.strptime sucks (doesn't know %Z), so we have to parse the send-times
	entries = []
	for r in data:
		e = dict(r)
		t = r['time']
		time = datetime.datetime.strptime(t[:-6], "%d/%m/%Y %H:%M:%S")
		sign = 1 if t[-5] == '+' else -1
		hour_offset = int(t[-4:-2])
		minute_offset = int(t[-2:])
		
		dt = datetime.timedelta(hours = hour_offset, minutes = minute_offset)
		e['time'] = time + dt if sign > 0 else time - dt
		e['group'] = 0 # for grouping
		entries.append(e)
	return entries
	

def group_partial_messages(entries, maxdiff=datetime.timedelta(minutes=1) ):
	""" Groups partial messages together that might be part of a single SMS.
	Returns the entries and max_gid, which is the biggest gid that was set + 1.
		
	Entries musst be a list of entries (and each entry a dictionary)!	
	For this, a simple heuristic is used: the sender and the total amount of parts
	of the SMS must be identical, and the send-times may not differ by more than
	maxdiff (which is assumed to be a datetime.timedelta).
	To mark the groups, the 'group'-field of each entry will be set (to some gid > 0).
	Entries with the same gid are assumed to be from the same SMS. Entries with 
	gid == 0 could not be linked to other SMS.
	NOTE: the algorithm assumes that only partial messages are passed into it!
	"""
	next_gid = 1 #next group-id
	for i in entries:
		for j in entries:

			if i == j: continue
			if i['sender'] != j['sender']: continue
			if i['total'] != j['total']: continue
			
			if abs(i['time'] - j['time']) < maxdiff:
				if i['group'] and j['group'] and i['group'] != j['group']:
					raise Exception("both groups already set!") # this should never happen, actually ;)
				elif j['group']:
					i['group'] = j['group']
				elif i['group']:
					j['group'] = i['group']
				else:
					gid = next_gid
					next_gid += 1
					j['group'] = gid
					i['group'] = gid
	return (entries, next_gid)


def assemble_sms(entries):
	""" Assembles an SMS out of a list of entries.
	The entries are assumed to be from the same SMS. The send-time
	of the SMS will be the send-time of the first part of the linked SMS."""
	
	entries.sort(lambda x, y: x['part'] - y['part'])
	sms = dict({'sender': entries[0]['sender'], 'time': entries[0]['time']})
	text = ""
	for e in entries:
		text += e['text']
	sms['text'] = text
	return sms


def handle_unfinished_group(entries):
	""" Assembles an SMS that has missing parts."""
	entries.sort(lambda x, y: x['part'] - y['part'])
	sms = dict({'sender': entries[0]['sender'], 'time': entries[0]['time']})
	text = ""
	
	prev = 0
	for e in entries:
		text += e['text'] if (prev + 1 == e['part']) else "<#missing part#>"
	sms['text'] = text
	return sms	
	
	
def concatenate_linked_messages(db, do_insert_incompletes):
	"""
	Returns tuple consisting of a list of all the SMS that were linked 
	(i.e. have more than one part) and a list of all the rawdata-ids that
	are part of these IDs.
	
	do_insert_incompletes determines if incomplete messages (i.e., mutli-
	messages SMS that could not be completely reassembled) will also be stored.
	"""
	
	entries = get_partial_messages(db)
	(entries, max_gid) = group_partial_messages(entries)
	
	# these didn't find a group:
	for i in entries:
		if not i['group']: 
			print "Could link ID to a group:", i['id']
					
	groups = {}
	for gid in range(max_gid):
		for j in entries:
			if j['group'] == gid:
				sys.stdout.flush()
				groups.setdefault(gid, []).append(j)
				
	def show_incorrect_group_size_error(g):
		""" Short helper function ;)"""
		print "ERROR: number of parts not correct for sms. IDs:",
		print ", ".join([str(x['id']) for x in g]),
		print "(expected:", g[0]['total'], ")"	

	smslist = []
	ids = []
	for g in groups.values():
		if len(g) > g[0]['total']:
			show_incorrect_group_size_error(g)
		elif len(g) < g[0]['total']:
			if do_insert_incompletes:
				smslist.append(handle_unfinished_group(g))
				ids.extend([x['id'] for x in g])
			else:
				show_incorrect_group_size_error(g)
		else:
			smslist.append(assemble_sms(g))
			ids.extend([x['id'] for x in g])
			
	return (smslist, ids)


try:
	opts, args = getopt.getopt(sys.argv[1:], "hi", ["help", "insert-incomplete"])
except getopt.GetoptError, err:
	print str(err)
	print __doc__
	sys.exit(2)
	
do_insert_incompletes = False
for o,a in opts:
	if o in ("-h", "--help"):
		print __doc__
		sys.exit()
	elif o in ("-i", "--insert-partial"):
		do_insert_incompletes = True
	

db = setup_db()
parse_input(db, sys.stdin)

# first, insert those entries from rawdata that were complete SMS to begin with.
db.execute("""INSERT INTO inbox(sender, text, time)
		      SELECT sender, text, time FROM rawdata WHERE total = 1 AND processed = 0;""")
db.execute("UPDATE rawdata SET processed = 1 WHERE total = 1 AND processed = 0")
	
stmt = "INSERT INTO inbox(sender, text, time) VALUES(:sender, :text, :time)"
(smslist, ids) = concatenate_linked_messages(db, do_insert_incompletes)

for sms in smslist:
		db.execute(stmt, sms)

stmt = """
	UPDATE rawdata
	SET processed = 1
	WHERE id IN (""" + ", ".join([str(i) for i in ids]) + ")"
db.execute(stmt)
db.commit()


