twi.pl

#!/usr/bin/perl
$DBACL_PATH="/Users/surly/Dropbox/core/.dbacl";
#TODO:
#@lines0 = ("organ", "sichord", "harp");
#$i = @lines0; #assigns length of @lines
#print $i;
#while ($lines0[--$i])
#{print "$i $lines0[$i]";}#2 harp1 sichord0 organ-1 harp-2 sichord-3 organ 
#This is somehow just how perl arrays work. So, the push() function must be eliminated!


##############BUGS


#Re: capturing f1,f2, key presses instantlly
#Posted: 01-21-2004, 07:05 PM

#use Term::ReadKey;
#
#ReadMode 'cbreak';
#$key = ReadKey(0);
#ReadMode 'normal';

#09.10.07: length of ok.d: 481 lines, bad.d: 738 lines, urgent.d: 8 lines, all.d: 1506 lines (started after the others)

# associating the user with their words may be a good compromise. My suggestion would be to implement categorization both with and without user-word pairs, and then compare how well they categorize future messages.


@old=`/Library/Frameworks/Python.framework/Versions/2.4/bin/twyt friendstl -n 50 -q`;


$i=0;$j=0;$bcount=0;
while($old[$i])
{	$old[$i]=~s/\s?\<\/title\>//g;$old[$i]=~s/"/\"/g;
	if ($old[$i]!~/\[(\d)+\]/) #using a feature of Twyt
	{	$lines[$j-1]=$lines[$j-1] . $old[$i];}#concatenate the strings to overcome parsing problems
	else
	{	$lines[$j++]=$old[$i];}
	$i++;
}

#$length=@old;
#print $length;print "\n";

$gray=1; #controls whether posts that are predicted to be bad are printed in an almost-white, basically ignorable, gray
$feeds=1;

if ($feeds) 
{	`/Users/surly/code/bayes/stati`;}


$in='/Users/surly/code/bayes/statuses.txt';
open (INFO,$in);@more=<INFO>;close(INFO);

$i=@lines;
$ml=@more;
while (--$ml>=0)
{$lines[$i+$ml]=$more[$ml];}

#from here down it's just bazl - keep them synced
#ideally, call bazl as a subroutine

#all of the next loop could really be done in the previous loop
$score=0;$iscore=0;


$i=@lines;$i--;
while($i>=0)
{	$lines[$i]=~s/\s?\<\/title\>//g;$lines[$i]=~s/"/\"/g;
#	$lines[$i]=~s/"/\\\"/g;#yields an unwanted backslash
#it seems like somtimes they have unwanted carriage returns (or something) in them:
	$lines[$i]=~s/\n//g; $lines[$i]=~s/\r//g;
#that last line does not in fact solve the problem.
#The problem is that the twyt twitter client (at least when piped to tac) must be splitting something like this:
#"Beauty is mysterious as well as terrible. God and devil are fighting don spring there, and the battlefield is the heart of man. Fyodor Dostoevsky" so that the author's name is separate... 
#That it is always the author's name that gets split off is perplexing.

	$linesfixed[$i]=$lines[$i];
#	$linesfixed[$i]=~s/"/\\\"/g;
#	$linesfixed[$i]=~s/“/\\\"/g;$linesfixed[$i]=~s/”/\\\"/g;
#	$linesfixed[$i]=~s/'/’/g;# ` Fails because ` is also a special character.
##	$cat[$i]=`echo "$linesfixed[$i]"|dbacl -v -c ok -c bad -c urgent`;
	
	#verbose bugfixing mode
#	print "echo $linesfixed |dbacl -v -c ok -c bad -c urgent\n";
#	print "cat= $cat\n";
#	print "lines[i]= $lines[$i]\n";
	
#categorization prediction, done
	$url[$i]=$lines[$i];$url[$i]=~s/^.*http/http/;
	if ($url[$i]!~/http/) {$url[$i]=$lines[$i]; $url[$i]=~s/^.*www/www/;}
	$url[$i]=~s/\s.*$//;#only opens one url	
	$hasurl[$i]=(($url[$i]=~/http/)||($url[$i]=~/www/));
	$url[$i]=~s/,$//;$url[$i]=~s/;$//;$url[$i]=~s/\.$//; #removing punctuation
#	$hasurl[$i]=(($hasurl[$i])&&($url!~/twitpic\.com/));
	#could use better URL identification/extraction...
	#what about google.com?
	#anything.com...
	#print "url = $url[$i] hasurl = $hasurl[$i]\n";
	if ($hasurl[$i]) #first assume that it's one of those shortened URLs
	{	$expanded[$i]=`curl -s "$url[$i]" -A "Mozilla/4.0"|grep http|grep HREF|perl -p -i -e "s/This resource has permanently moved to //"`;
	#this approach works beautifully with bit.ly URLs but not tinyurl...
	#if you try to curl tinurl, it just sits there
		$expanded[$i]=~s/">The requested URL.*//;
		$expanded[$i]=~s/<A HREF="//;
		$expanded[$i]=~s/The document has moved//;
		$expanded[$i]=~s/\">here<\/A>.<P>//;
		$expanded[$i]=~s/\">.*//;

#		$notshortened[$i]=($url[$i]=~/tw2t\.com/); #skip the expansion part
#now even tinyurl expansion is screwed up!!
#		if (!$notshortened[$i])
#		{
		if ($url[$i]=~/tw2t/)
		{$title[$i]=`links "$url[$i]" -receive-timeout 60 -retries 1 -source|grep "<title>"`;
		#this works from the command line! Maybe wget could be a backup
		$title[$i]=~s/.*<title>//;$title[$i]=~s/<\/title>.*//;#copied here to deal with 
		$title[$i]=~s/»/>>/;$title[$i]=~s/\\n//g; #meta tags crepping in before <title>
		}
		else
		{	if ($expanded[$i]=~/http/)
			{
#			if (($cat[$i]=~/b/)&&($gray))
#				{	print "\033[0;37;48m$expanded[$i]";
#					print "\033[0m";}
#				else
#				{	print "\nExpanded: $expanded[$i]";}
				$url[$i]=$expanded[$i];
			}
			$html[$i]=($url[$i]!~/jpg/)&&($url[$i]!~/png/);
			$html[$i]=$html[$i]&&($url[$i]!~/mp3/);
			if (($html[$i])&&($url[$i]))
			{	$title[$i]=`links "$url[$i]" -receive-timeout 60 -retries 1 -source|grep "<title>"`;
				$title[$i]=~s/.*<title>//;$title[$i]=~s/<\/title>.*//;
				$title[$i]=~s/»/>>/;$title[$i]=~s/\\n//g;
			}
		}
	}
	$linefout[$i]=join(" ",$linesfixed[$i],$expanded[$i],$title[$i]);
	$linefout[$i]=~s/\n//g;$linefout[$i]=~s/\"/''/g;# I think this one is necessary because of the double quotes in the $cat= line below.
	$linefout[$i]=~s/\`/'/g;
	$lineout[$i]=join(" ",$lines[$i],$expanded[$i],$title[$i]);$lineout[$i]=~s/\n//g;
	$linefout[$i]=~s/"/''/g;
	open(CMD, ">$DBACL_PATH/cmd");# Open for appending
	print CMD "$linefout[$i]\n"; close(CMD);
	$cat[$i]=`cat $DBACL_PATH/cmd|dbacl -v -c ok -c bad -c urgent`;

	#whitelist
	$white=($linefout[$i]=~/someone cool/);
	$black=($linefout[$i]=~/someone boring/);
	$black=($black)||($linefout[$i]=~/via Sponsored Tweets/); 
	if ($white)
		{	$cat[$i]="ok";}
	if ($black)
		{	$cat[$i]="bad";}

	$i--;
}



#if ($hasurl[$i])
#{	
#}

########SECOND TIME THROUGH FOR DISPLAY!
#$i=$j;# this could fail if the last line was broken and $j is one too large
#$i = @lines; #assigns length of @lines
#if (!$lines[$j])	{$i=$j-1;}

$i=@lines;$i--;
while($i>=0)
 {
#####	$lines[$i]=~s/\s?\<\/title\>//g;$lines[$i]=~s/"/\"/g;
#	$lines[$i]=~s/"/\\\"/g;#yields an unwanted backslash
#it seems like somtimes they have unwanted carriage returns (or something) in them:
#####	$lines[$i]=~s/\n//g; $lines[$i]=~s/\r//g;
#that last line does not in fact solve the problem.
#The problem is that the twyt twitter client (at least when piped to tac) must be splitting something like this:
#"Beauty is mysterious as well as terrible. God and devil are fighting don spring there, and the battlefield is the heart of man. Fyodor Dostoevsky" so that the author's name is separate... 
#That it is always the author's name that gets split off is perplexing.

#could probably be better handled with print OUFTO "$whatever";


#	$linesfixed=$lines[$i];
#	$linesfixed=~s/"/\\\"/g;
#	$linesfixed=~s/“/\\\"/g;$linesfixed=~s/”/\\\"/g;
	$lines[$i]=~s/“/"/g;$lines[$i]=~s/”/"/g;
	###new attempt to route around '/" problem:
#	$linesfixed=~s/'/’/g;# ` Fails because ` is also a special character.
#	$cat=`echo "$linesfixed"|dbacl -v -c ok -c bad -c urgent`;
	
	$cat=$cat[$i];
	
	
	#verbose bugfixing mode
#	print "echo $linesfixed |dbacl -v -c ok -c bad -c urgent\n";
#	print "cat= $cat\n";
#	print "lines[i]= $lines[$i]\n";
	
	if ($cat=~/ok/)
	{	print "$i: $lines[$i]";}
	elsif ($cat=~/bad/)
		{	print "\033[0;37;48m$lines[$i]";
			print "\033[0m";$bcount++;}
#	{	print "$i:-$lines[$i]";}
		elsif ($cat=~/urgent/)
#	{	print "*$lines[$i]";}
		{	print "\033[0;34;48m$lines[$i]";
		print "\033[0m";}
		
	if ($hasurl[$i]) #first assume that it's one of those shortened URLs
	{	if ($expanded[$i]=~/http/)
		{	if (($cat[$i]=~/b/)&&($gray))
			{	print "\033[0;37;48m$expanded[$i]";
				print "\033[0m";$bcount++;}
			else
			{	print "\nExpanded URL: $expanded[$i]";}
			$url[$i]=$expanded[$i];
		}
		if (($title[$i]))
		{	if (($cat[$i]=~/b/)&&($gray))
			{	print "\033[0;37;48m$title[$i]";
				print "\033[0m";}
			else
			{	print "\n$title[$i]";}
		}
	}


##	$lines[$i]=~s/'/’/g; #weirdly an extra space shows up on the console, but not in all.d, so I'm making this change after display
	$predict=$cat[$i];
	open(ALL, ">>$DBACL_PATH/all.d");# Open for appending
	print ALL "$linefout[$i]\n"; close(ALL);
	if (($predict=~/bad/)&&($gray))
	{	print "\n";
	open(SPAM, ">>$DBACL_PATH/spam.d");# Open for appending
	print SPAM "$lines[$i]\n"; close(SPAM);
	#if it's unseen (or hard to see), it goes to the spam file
	}
	else
	{	print "\no)k, b)ad, u)rgent, open l)ink, c)opy, add to q)uotes, r)ewind, <CR>=next\n";
		$cat=<>;$cat=~s/\n//;
		if ($cat=~/o/)
		{	open(OK, ">>$DBACL_PATH/ok.d");# Open for appending
			print OK "$lineout[$i]\n"; close(OK);
			print "Categorized as ok.\n";}
		if (($cat=~/b/)||($cat=~/-/))
		{	open(BAD, ">>$DBACL_PATH/bad.d");# Open for appending
			print BAD "$lineout[$i]\n"; close(BAD);
			print "Categorized as bad.\n";}
		if ($cat=~/u/)
		{	open(URGENT, ">>$DBACL_PATH/urgent.d");# Open 
			print URGENT "$lineout[$i]\n"; close(URGENT);
			print "Categorized as urgent.\n";}
		if ($cat=~/q/)
		{	$input=`/bin/date +"%y%m%d"`;$input=~s/\n//;$input=~s/\r//;
			open(Q, ">>/Users/surly/track/q.log");
			print Q "$input $lines[$i]\n"; close(Q);
			print "Added to quotes file.\n";}
		if ($cat=~/l/)
		{	$url[$i]=~s/^.*http/http/;
			$url[$i]=~s/\s.*$//;#only opens one url
			open(LINK, ">>$DBACL_PATH/link.d");# which links have I visited? Currently not used for categorization
			print LINK "$lineout[$i]\n"; close(LINK);

			print "Opening $url[$i] ...\n";
			`open -a Safari "$url[$i]"`;}
		if ($cat=~/c/)
		{	open(CLIP, ">/Users/surly/track/clip");
			print CLIP "$lines[$i]\n"; close(CLIP);
			`/usr/bin/pbcopy < /Users/surly/track/clip`;
			print "Copied to clipboard.\n";}
	}

	
	#Once debugged, migrate this to twf
	if ($cat=~/r/)
	{	$entity=$lines[$i];$entity=~s/:.*//;
		$entity=~s/\[(\d)+\] //;
		print "rewinding $entity...\n";
		@matches=`grep $entity $DBACL_PATH/all.d |tac`;
		print $matches[0];
		$key='p'; $k=1;
		while (($key!~/q/)&&($matches[$k]))
		{	print $matches[$k];
			print "q)uit, any other key to keep rewinding\n";
			$k++;
			$key=<>;
		}

	}

#a shell script for just grabbing the first keypress:
#
#readOne () {
#tput smso
#echo "Press any key to return \c"
#tput rmso
#oldstty=`stty -g`
#stty -icanon -echo min 1 time 0
#dd bs=1 count=1 >/dev/null 2>&1
#stty "$oldstty"
#echo
#}
#Very interesting. I've never seen a way to read a single character from a #shell script before. But I want to be able to actually catch the character #that is typed. So I changed the dd statement to:
#result=`dd bs=1 count=1 2>/dev/null` 
#which seems to be working. Very cool trick!

	if ($predict=~/ok/)
		{	if ($cat=~/b/)
				{$score-=1;}
			else#if ($cat=~/o/)
				{$score+=1;}
			$iscore+=1;
		}
	elsif (($predict=~/bad/)&&($gray==0))
		{	if ($cat=~/o/)
				{$score-=1;}
			elsif ($cat=~/b/)
				{$score+=1;}
			$iscore+=1;
		}
		

	$i--;
}

#Save the posts that were marked as bad but which were actually OK.
if ((@lines)&&($bcount))
{	print "Categorize which other posts as OK? \n";
	$cat=<>;$cat=~s/\n//;
	@list = split(/ /, $cat);
	foreach $j (@list)
	{	open(OK, ">>$DBACL_PATH/ok.d");
		print OK "$lineout[$j]\n"; close(OK);
		print "ok: $lineout[$j]\n";}
}

if ($iscore>0)
{	$score=(($score+$iscore-$j)/$iscore)/2;
	print "Accuracy= $score\n";
}
if ($j>0)
{	`dbacl -l $DBACL_PATH/ok -g "^([[:alpha:]]+)" -g "[^[:alpha:]]([[:alpha:]]+)" -g "[^[:alpha:]](@[[:alpha:]]+)" $DBACL_PATH/ok.d`;
`dbacl -l $DBACL_PATH/bad -g "^([[:alpha:]]+)" -g "[^[:alpha:]]([[:alpha:]]+)" -g "[^[:alpha:]](@[[:alpha:]]+)" $DBACL_PATH/bad.d`;
`dbacl -l $DBACL_PATH/urgent -g "^([[:alpha:]]+)" -g "[^[:alpha:]]([[:alpha:]]+)" -g "[^[:alpha:]](@[[:alpha:]]+)" $DBACL_PATH/urgent.d`;
}

#^([[:alpha:]]+)
#beginning of the line, some number of letters
#[^[:alpha:]]([[:alpha:]]+)
# the notation[^a-z] means one character that is not in the set a-z (like a space) - so this grabs the Nth word, for N>1
#Therefore the ^(@[[:alpha:]]+) from `dbacl -l $DBACL_PATH/ok -g "^([[:alpha:]]+)" -g "[^[:alpha:]]([[:alpha:]]+)" -g "^(@[[:alpha:]]+)" -g "[^[:alpha:]](@[[:alpha:]]+)" $DBACL_PATH/ok.d`;
#can be eliminated, as it never arises in my formulation.
#http://dbacl.sourceforge.net/tutorial-3.html
# alphanumeric word pairs  	(^|[^[:alnum:]]) ([[:alnum:]]+) [^[:alnum:]]+ ([[:alnum:]]+) ||23  	good morning, how are

#"The last entry in the table above shows how to take word pairs as features. Such models are called bigram models, as opposed to the unigram models whose features are only single words, and they are used to capture extra information. "

#What I want to do is pick off the very first word (the name of the person who's status the rest of the line is) and pair it with every word in the sentence.