#include #include #include #include /* Program som använder Acquaintance-metoden för automatisk klassifikation av texter. Metoden baseras på de idéer som lanserades av Marc Damashek i artikeln "Gauging Similarity with n-grams: Language-Independent Categorization of Text" som publicerades i Science 10 Feb. 1995, p. 843 ff. Smart inläsningsrutin som bara läser varje tecken en gång, oavsett vald längd på provsträngarna (n-grammerna). Tillordning av en hashnyckel till varje teststräng. Inplockning av teststrängen i en hashtabell. Iteration genom texten. Presentation av textens koordinater - möjlighet till lagring och jämförelse med annan text. Aktuella n-grammer kan lagras i en sluten länkad lista. Varje textinläsning gör att en n-gram blir färdig och kan hashas in. Samtidigt påbörjas en ny n-gram. Specialfall erhålles i början och slutet av texten; kanske kan dessa helt enkelt strykas. Hashtabellen sparas i en vektor. (C) This program and the subroutines/header files it uses are copyrighted by Jonas Gustavsson 1996. 
*/; const int n=5; char ch; double thr=0.07; Vec cl; Vec incl; main() { cl.insert(cl.end(),new Cluster("cluster of all texts")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/PraktiskSvenska/PRLeon")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/Busi/BETrans2.ascii")); cl[0]->hangon(new Hashtab(n,"../../../public_html/citat.e/kublakhan2.txt")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/appl2.txt")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/herrarne2.txt")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/kallocain2.txt")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/ucsdfreshman2.txt")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/Ling/art.txt")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/Ling/soc.txt")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/Ling/med.txt")); cl[0]->hangon(new Hashtab(n,"../../../maclada/texter/Ling/sci.txt")); bool placed; int focus=0; while (focus>=0) { int old_size=cl.size(); //Number of clusters so far. if (focus>0) cl[focus]->zoomin(); //"förstörande" inzoomning int cl_con=cl[focus]->ant(); //Number of texts in studied cluster incl.resize(cl_con); Matrix S(cl_con,cl_con); for (int i=0;imedl(i); for (int j=i+1;jmedl(j); cout<<"Similarity score ("<filn(); cout<<" - "<filn()<<") : "; S.put(texti->comp(textj),j,i); S.put(S.get(j,i),i,j); cout<hangon(cl[focus]->medl(0)); //The first text starts off the first cluster. incl[0]=0; cout<<"Number of clusters: "<thr) //If text i is similar to text j, { cl[incl[j]]->hangon(cl[focus]->medl(i)); //it should be placed in incl[i]=incl[j]; //the same cluster as that text. placed=true; } j++; } if (!placed) { cout<<"No match for this text has been found! New cluster built!\n"; cl.insert(cl.end(),new Cluster("new cluster")); cl[cl.size()-1]->hangon(cl[focus]->medl(i)); incl[i]=cl.size()-1; } } cout<<"Number of clusters: "<ready(); //detta tar lång tid... 
cl[i]->display(); cl[i]->setname(); } cout<<"Similarity between clusters \n"; for (i=old_size;iname()<<'-'<name()<<" : "<centroid()->comp(cl[j]->centroid())<<'\n'; } } cout<<"Please enter the number of the cluster you want to check out more in detail. \n"; cout<<"(Enter -1 to exit program.) "; cin>>focus; } for (int i=0;i