speice.io/assets/js/72c73938.10945791.js

1 line
26 KiB
JavaScript
Raw Permalink Normal View History

"use strict";(self.webpackChunkspeice_io=self.webpackChunkspeice_io||[]).push([["9658"],{8190:function(e,t,n){n.r(t),n.d(t,{assets:function(){return l},contentTitle:function(){return o},default:function(){return c},frontMatter:function(){return r},metadata:function(){return s},toc:function(){return h}});var s=n(5419),i=n(5893),a=n(65);let r={slug:"2016/03/tweet-like-me",title:"Tweet like me",date:new Date("2016-03-28T12:00:00.000Z"),authors:["bspeice"],tags:[]},o=void 0,l={authorsImageUrls:[void 0]},h=[{value:"The Objective",id:"the-objective",level:2},{value:"The Data",id:"the-data",level:2},{value:"The Algorithm",id:"the-algorithm",level:2},{value:"Pulling it all together",id:"pulling-it-all-together",level:2},{value:"The results",id:"the-results",level:2},{value:"Moving on from here",id:"moving-on-from-here",level:2},{value:"For further reading",id:"for-further-reading",level:2}];function d(e){let t={a:"a",annotation:"annotation",code:"code",em:"em",h2:"h2",img:"img",li:"li",math:"math",mi:"mi",mn:"mn",mo:"mo",mover:"mover",mrow:"mrow",ol:"ol",p:"p",pre:"pre",semantics:"semantics",span:"span",...(0,a.a)(),...e.components};return(0,i.jsxs)(i.Fragment,{children:[(0,i.jsx)(t.p,{children:"In which I try to create a robot that will tweet like I tweet."}),"\n",(0,i.jsx)(t.p,{children:"So, I'm taking a Machine Learning course this semester in school, and one of the topics we keep coming back to is natural language processing and the 'bag of words' data structure. That is, given a sentence:"}),"\n",(0,i.jsx)(t.p,{children:(0,i.jsx)(t.code,{children:"How much wood would a woodchuck chuck if a woodchuck could chuck wood?"})}),"\n",(0,i.jsx)(t.p,{children:"We can represent that sentence as the following list:"}),"\n",(0,i.jsx)(t.p,{children:(0,i.jsx)(t.code,{children:"{ How: 1 much: 1 wood: 2 would: 2 a: 2 woodchuck: 2 chuck: 2 if: 1 }"})}),"\n",(0,i.jsxs)(t.p,{children:["Ignoring ",(0,i.jsx)(t.em,{children:"where"})," the words happened, we're just interested in how ",(0,i.jsx)(t.em,{children:"often"}),' the words occurred. That got me thinking: I wonder what would happen if I built a robot that just imitated how often I said things? It\'s dangerous territory when computer scientists ask "what if," but I got curious enough I wanted to follow through.']}),"\n",(0,i.jsx)(t.h2,{id:"the-objective",children:"The Objective"}),"\n",(0,i.jsx)(t.p,{children:"Given an input list of Tweets, build up the following things:"}),"\n",(0,i.jsxs)(t.ol,{children:["\n",(0,i.jsx)(t.li,{children:'The distribution of starting words; since there are no "prior" words to go from, we need to treat this as a special case.'}),"\n",(0,i.jsxs)(t.li,{children:["The distribution of words given a previous word; for example, every time I use the word ",(0,i.jsx)(t.code,{children:"woodchuck"})," in the example sentence, there is a 50% chance it is followed by ",(0,i.jsx)(t.code,{children:"chuck"})," and a 50% chance it is followed by ",(0,i.jsx)(t.code,{children:"could"}),". I need this distribution for all words."]}),"\n",(0,i.jsx)(t.li,{children:"The distribution of quantity of hashtags; Do I most often use just one? Two? Do they follow something like a Poisson distribution?"}),"\n",(0,i.jsx)(t.li,{children:"Distribution of hashtags; Given a number of hashtags, what is the actual content? I'll treat hashtags as separate from the content of a tweet."}),"\n"]}),"\n",(0,i.jsx)(t.h2,{id:"the-data",children:"The Data"}),"\n",(0,i.jsx)(t.p,{children:"I'm using as input my tweet history. I don't really use Twitter anymore, but it seems like a fun use of the dataset. I'd like to eventually build this to a point where I can imitate anyone on Twitter using their last 100 tweets or so, but I'll start with this as example code."}),"\n",(0,i.jsx)(t.h2,{id:"the-algorithm",children:"The Algorithm"}),"\n",(0,i.jsxs)(t.p,{children:["I'll be using the ",(0,i.jsx)(t.a,{href:"http://www.nltk.org/",children:"NLTK"})," library for doing a lot of the heavy lifting. First, let's import the data:"]}),"\n",(0,i.jsx)(t.pre,{children:(0,i.jsx)(t.code,{clas