#!/usr/bin/perl
#
# stati - fetch the Facebook friends-status RSS feed, keep the statuses that
# are newer than the previous run, classify each one with dbacl into the
# categories "ok", "bad" or "urgent", print it to the terminal (bad and urgent
# ones wrapped in ANSI colour codes) and append it to statuses.txt unless it
# comes from an ignored person.
#
# History:
#   100304  Updated to deal with hyperlinks in statuses leading to an extra
#           "<description>" section.
#   090830  "Day '' out of range 1..31" cropped up because the loop tried to
#           parse an n+1th status when there were only n.  The text after the
#           last </pubDate> is a lone newline; chunks without a parseable
#           <pubDate> are now skipped explicitly.
#
# Notes on the dbacl corpus (kept from the original script):
#   % dbacl -l twain -g '^([[:alpha:]]+)' -g '[^[:alpha:]]([[:alpha:]]+)' Mark_Twain.txt
#   builds a category that depends only on single alphabetic words.  That form
#   was chosen to deal with the twitter "@whoever" grammar AND to eliminate
#   word-pair matching, since the corpus is so small.

use strict;
use warnings;

use IPC::Open2;
use Time::Local;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

my $LAST_CHECK_FILE = '/Users/surly/.laststati';
my $STATUS_FILE     = '/Users/surly/code/bayes/statuses.txt';
my $DBACL_PATH      = '/Users/surly/Dropbox/core/.dbacl';
my $FEED_URL        = 'http://www.facebook.com/feeds/friends_status.php?id=[some long number]&key=[some alphanumeric sequence]&format=rss20';
my $USER_AGENT      = 'Mozilla/4.0';

# The feed switched its offset from -0400 to +0400 after Daylight Saving Time
# ended and the time stamps were suddenly off by nine hours.  The simplest fix
# was to advance the stored last-check time by nine hours (for now).
my $FEED_CLOCK_SKEW = 32400;

# Statuses matching any of these are shown on screen but not written to
# $STATUS_FILE.
my @IGNORED_PEOPLE = (
    qr/boring person1/,
    qr/boring person2/,
    qr/boring person3/,
);

my %MONTH_INDEX = (
    Jan => 0, Feb => 1, Mar => 2,  Apr => 3,
    May => 4, Jun => 5, Jul => 6,  Aug => 7,
    Sep => 8, Oct => 9, Nov => 10, Dec => 11,
);

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

my $last_check = read_last_check($LAST_CHECK_FILE) + $FEED_CLOCK_SKEW;

# Truncate the output file up front, exactly as before.  A failure here is
# reported but does not stop the terminal display.
my $status_fh;
unless ( open $status_fh, '>', $STATUS_FILE ) {
    warn "Cannot write $STATUS_FILE: $!\n";
    undef $status_fh;
}

# Record the time of this run before fetching (same ordering as the original).
write_last_check( $LAST_CHECK_FILE, time() );

# The original only assigned a Perl variable, which dbacl never sees.  Export
# it, but let a value already present in the environment win.
# NOTE(review): confirm this directory is where the ok/bad/urgent categories
# live.
$ENV{DBACL_PATH} = $DBACL_PATH unless defined $ENV{DBACL_PATH};

# Joining with a space and splitting on </pubDate> yields one chunk per
# status: the title text followed by its <pubDate>.
my @feed_lines = fetch_feed_lines( $FEED_URL, $USER_AGENT );
my @entries    = split m{</pubDate>}, join( ' ', @feed_lines );

ENTRY:
for my $entry (@entries) {

    # Order matters: "." does not match a newline, so the description is cut
    # to the end of its own line before the newlines are removed.
    $entry =~ s/<description>.*/[hyperlinked]/g;
    $entry =~ s/\s?<\/title>//g;
    $entry =~ s/\n//g;

    my $posted = parse_pub_date($entry);
    next ENTRY unless defined $posted;    # e.g. the chunk after the last item

    print "$posted: $entry (Internal testing)";
    print "$posted $last_check (Internal testing)";
    print "\n";

    # The loop has always stopped at the first status that is not newer than
    # the previous run.
    last ENTRY unless $posted > $last_check;

    my $status   = format_status($entry);
    my $category = classify($status);

    if ( defined $status_fh && !grep { $status =~ $_ } @IGNORED_PEOPLE ) {
        print {$status_fh} $status;
    }

    if ( $category =~ /ok/ ) {
        print "$status";
    }
    elsif ( $category =~ /bad/ ) {
        print "\033[0;37;48m$status";
        print "\033[0m";
    }
    elsif ( $category =~ /urgent/ ) {
        print "\033[0;34;48m$status";
        print "\033[0m";
    }
}

if ( defined $status_fh ) {
    close $status_fh or warn "Error closing $STATUS_FILE: $!\n";
}

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Return the epoch time stored by the previous run, or 0 when the file is
# missing, empty or does not start with a number.
sub read_last_check {
    my ($path) = @_;

    my $fh;
    unless ( open $fh, '<', $path ) {
        warn "Cannot read $path ($!); treating every status as new\n";
        return 0;
    }
    my $stamp = <$fh>;
    close $fh;

    return 0 unless defined $stamp;
    return $stamp =~ /\A\s*(\d+)/ ? $1 : 0;
}

# Store the epoch time of this run for the next one.
sub write_last_check {
    my ( $path, $epoch ) = @_;

    my $fh;
    unless ( open $fh, '>', $path ) {
        warn "Cannot update $path: $!\n";
        return;
    }
    print {$fh} "$epoch\n";
    close $fh or warn "Error closing $path: $!\n";
    return;
}

# Download the feed with curl (list form, so no shell is involved) and return
# the title/pubDate lines with leading whitespace, the <title> tag and the
# basic XML entities removed.  Returns an empty list when curl cannot be run.
#
# The original did this in a shell pipeline whose substitutions were
# ineffective: inside backticks "\s" reached the shell as a plain "s", and the
# entity patterns replaced a character with itself.
# NOTE(review): the entity decoding is restored on the assumption that
# &amp; / &gt; / &lt; were intended - confirm against a live feed.
sub fetch_feed_lines {
    my ( $url, $agent ) = @_;

    my $curl;
    my $pid = open $curl, '-|', 'curl', '--fail', '-s', $url, '-A', $agent;
    unless ($pid) {
        warn "Cannot run curl: $!\n";
        return;
    }

    my @kept;
    while ( my $line = <$curl> ) {
        next unless $line =~ /title|pubDate/;
        next if $line =~ /s Friends/;    # the channel's own title

        $line =~ s/^\s+//;
        $line =~ s/&lt;/</g;
        $line =~ s/&gt;/>/g;
        $line =~ s/&amp;/&/g;            # last, so "&amp;lt;" stays "&lt;"
        $line =~ s/<title>//g;
        push @kept, $line;
    }
    close $curl
        or warn 'curl did not finish cleanly (exit status ' . ( $? >> 8 ) . ")\n";

    return @kept;
}

# Extract the <pubDate> of a status chunk and return it as an epoch time.  As
# before, the clock time is interpreted as local time and the numeric offset
# is ignored.  Returns undef when there is no usable date.
sub parse_pub_date {
    my ($entry) = @_;

    return undef
        unless $entry =~ m{
            <pubDate> \s* \w+, \s+
            (\d{1,2}) \s+ ([A-Za-z]{3}) \s+ (\d{4}) \s+
            (\d{1,2}) : (\d{2}) : (\d{2})
        }xi;
    my ( $day, $month_name, $year, $hour, $min, $sec ) = ( $1, $2, $3, $4, $5, $6 );

    my $month = $MONTH_INDEX{ ucfirst lc $month_name };
    return undef unless defined $month;

    # timelocal() croaks on out-of-range fields; treat that as "no date".
    my $epoch = eval { timelocal( $sec, $min, $hour, $day, $month, $year ) };
    return $epoch;
}

# Turn a raw chunk "<text> <pubDate><date> <offset>" into the display line
# " <text> (<date>)\n".
sub format_status {
    my ($entry) = @_;

    $entry =~ s/\s+\[hyperlinked\]/ [hyperlinked]/g;

    # Drop the numeric offset and close the parenthesis.  The original only
    # did this for -0400/+0400, so any other offset left the line without its
    # ")" and newline.
    $entry =~ s/\s*(?:[-+]\d{4})?\s*\z/)\n/;
    $entry =~ s/\s*<pubDate>/ (/i;

    # NOTE(review): presumably a leftover workaround for shell quoting.  It is
    # kept so the text stays consistent with what earlier runs wrote to
    # statuses.txt; it could be dropped now that no shell is involved.
    if ( $entry =~ /'/ && $entry =~ /"/ ) {
        $entry =~ s/'/_/g;
        $entry =~ s/ _s/_s/g;
    }
    else {
        $entry =~ s/ 's/'s/g;
    }

    # Every line starts with exactly one space (the old "kluge to get rid of
    # extra spaces").
    $entry =~ s/\A\s*/ /;

    return $entry;
}

# Feed one status to "dbacl -v -c ok -c bad -c urgent" on stdin and return
# whatever it prints.  Returns '' when dbacl cannot be run.  The text comes
# from a remote feed, so it must never be interpolated into a shell command.
sub classify {
    my ($text) = @_;

    my $verdict = '';
    my $ran_ok  = eval {
        local $SIG{PIPE} = 'IGNORE';    # dbacl may exit before reading all input
        my ( $from_dbacl, $to_dbacl );
        my $pid = open2( $from_dbacl, $to_dbacl,
            'dbacl', '-v', '-c', 'ok', '-c', 'bad', '-c', 'urgent' );
        print {$to_dbacl} $text;
        close $to_dbacl;

        local $/;                       # slurp the whole answer
        $verdict = <$from_dbacl>;
        close $from_dbacl;
        waitpid $pid, 0;
        1;
    };
    warn "dbacl could not be run: $@" unless $ran_ok;

    return defined $verdict ? $verdict : '';
}

# Nothing below this marker is compiled, so trailing non-Perl text cannot
# break the script.
__END__
stati.pl
Subscribe to:
Posts (Atom)