sandrinolli
15-03-2010, 19:03
Allora ho questo file t2v.c :
/* User-tunable settings for a conversion run. */
long nout = 100;    /* a progress line is printed every nout tokens read */
long nsave = 500;   /* the dictionary is saved every nsave tokens read */
int nocase = 1;     /* non-zero: tokens are lower-cased before use */
int precision = 2;  /* precision given to "%.*lg" when printing the matrix */
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Sizes used throughout the program. */
enum {
    BUFLEN = 4096,  /* size of file-name and line buffers */
    WLEN = 30       /* size of the word field of a dictionary node */
};

/* Running total of the memory accounted for by the allocators, in bytes/1000. */
double totall = 0.0;
/* Node of the binary search tree that serves as the word dictionary. */
struct st_node {
char wrd[WLEN]; /* the word, copied in by addcode */
long int lbl; /* label: value of ucnt right after the node was created */
int tf; /* term freq (n occurrences in a doc) */
int df; /* document freq (n docs with occurrence) */
int cf; /* collection freq (total occurrences in coll) */
struct st_node *l, *r; /* children: smaller words go left, larger go right */
int isfirst; /* set to 1 by docstart, cleared by addcode when it bumps df */
};
/* ---- program-wide state ---- */

/* ucnt: nodes in the dictionary; rdcnt: tokens read in pass 1;
 * wcnt: word total (whole collection while the dictionary is written,
 * current document while the matrix is built). */
long int ucnt, rdcnt, wcnt;
long int ix;                      /* next slot of v written by dict2vect */
double entr;                      /* entropy written to the dictionary file */
FILE *ftp;                        /* document currently being read */
char fln[BUFLEN];                 /* name of the document-list file */
char fdn[]="dict.a"; FILE *fdp;   /* dictionary output file */
char fmn[]="tdm.dat"; FILE *fmp;  /* term-document matrix output file */
/* Token buffer filled by getword(). getword() limits its writes using
 * BUFLEN, so the buffer has to be BUFLEN+1 bytes; a WLEN-sized buffer is
 * overrun by any word of WLEN or more letters. */
char w[BUFLEN+1];
char **dcv;                       /* document names, used at indices 1..dcn */
long int dc=1;                    /* index of the current document */
long int dcn=0;                   /* number of documents */
double *v;                        /* current document vector, indices 1..ucnt */

/* ---- prototypes ---- */
struct st_node * newnode(void);
struct st_node * deltree(struct st_node *);
void printtree(FILE *,struct st_node *);
void addcode(const char *, struct st_node **);
long int encode(const char *, struct st_node **);
double entropy(struct st_node*);
long int words(struct st_node*);
void docstart(struct st_node*);
char** getdocnames(char*fln);
void dict2vect(struct st_node *,double *);
double *vect(int);
char *string(int);
/* These two were used/defined without a declaration in scope. */
int stripsuffix(const char *, char *);
int getword(char *, FILE *);
/*
 * Build a word dictionary and a term-document matrix.
 *
 * argv[1] names a list file (read by getdocnames).  Pass 1 reads every
 * listed document and fills the dictionary tree, saving it to fdn every
 * nsave tokens and once more, with a header, at the end.  Pass 2 re-reads
 * every document, converts it to a vector with dict2vect and appends that
 * vector as one line of fmn.
 *
 * Returns 0 on success; every error path leaves through exit() with a
 * negative status after printing a message on stderr.
 */
int main(int argc, char *argv[])
{
    struct st_node *dicttree=NULL;
    int n;

    /* start command line */
    if (argc>2) {
        fprintf(stderr,"WARNING - extra strings on command line\n");
    } else if (argc<2) {
        fprintf(stderr,"ERROR - no filename given on command line\n");
        exit(-3);
    }
    /* bounded copy: fln holds only BUFLEN bytes */
    n=snprintf(fln,sizeof fln,"%s",argv[1]);
    if (n<0 || (size_t)n>=sizeof fln) {
        fprintf(stderr,"ERROR - filename too long on command line\n");
        exit(-3);
    }
    /* end command line */

    /* start initializing algorithm */
    rdcnt=0;
    ucnt=0;
    dcv=getdocnames(fln);
    /* end initializing algorithm */

    /* NOTE: the dashed line below is the forum poster's insertion marker,
     * not C code; it is kept because the surrounding text refers to it. */
--------------------------------------------------------------------------------
    for (dc=1; dc<=dcn; dc++) {
        /* start read file into binary tree */
        docstart(dicttree);
        if ((ftp=fopen(dcv[dc],"r"))==NULL) {
            fprintf(stderr,"ERROR #1 - system reports:\n");
            perror(dcv[dc]);
            exit(-4);
        }
        /* w is already a char array: pass it directly, not its address */
        while (getword(w,ftp)) {
            rdcnt++;
            addcode(w,&dicttree);
            if (rdcnt%nout==0) {
                fprintf(stderr,"\r %12li read - %12li unique MEM %.1lf KB ",
                        rdcnt,ucnt,totall);
            }
            if (rdcnt%nsave==0) {
                /* periodic checkpoint of the dictionary (no header) */
                fprintf(stderr,"\r(saving...)");
                if ((fdp=fopen(fdn,"w"))==NULL) {
                    fprintf(stderr,"ERROR #2 - system reports:\n");
                    perror(fdn);
                    exit(-4);
                }
                printtree(fdp,dicttree);
                fclose(fdp);
            }
        }
        fclose(ftp);
        fprintf(stderr,"\n");
        /* end read file into binary tree */
    }

    /* start write tree on file */
    if (rdcnt<=0) {
        fprintf(stderr,"ERROR - no words found\n");
        exit(-5);
    }
    fprintf(stderr,"\rTotal %li tokens read - %li unique words\n",rdcnt,ucnt);
    fprintf(stderr,"Total memory allocated: %.0lf kilobytes\n",totall);
    if ((fdp=fopen(fdn,"w"))==NULL) {
        fprintf(stderr,"ERROR #3 - system reports:\n");
        perror(fdn);
        exit(-4);
    }
    wcnt=words(dicttree);       /* entropy() divides by wcnt */
    entr=entropy(dicttree);
    fprintf(fdp,"STORED %10li\n",wcnt);
    fprintf(fdp,"UNIQUE %10li\n",ucnt);
    fprintf(fdp,"ENTROPY %.2lf BITS \n",entr);
    fprintf(fdp,"Term-------------------------- -----Label ---df ---cf\n");
    printtree(fdp,dicttree);
    fclose(fdp);
    /* end write tree on file */

    /* start initialize term-document matrix file */
    v=vect(ucnt);
    fprintf(stderr,"\nAllocated document vector - Total memory now %.0lf KB\n\n",
            totall);
    if ((fmp=fopen(fmn,"w"))==NULL) {
        fprintf(stderr,"ERROR #5 - system reports:\n");
        perror(fmn);
        exit(-4);
    }
    fprintf(stderr,"\nPrinting term-document matrix...");
    fprintf(fmp,"%li\n", dcn);
    fprintf(fmp,"%li\n", ucnt);
    /* end initialize term-document matrix file */

    for (dc=1; dc<=dcn; dc++) {
        /* start convert file into document vector */
        docstart(dicttree);
        /* dc is a long: it needs %li, not %i */
        fprintf(stderr,"\rReading document %6li...",dc);
        if ((ftp=fopen(dcv[dc],"r"))==NULL) {
            fprintf(stderr,"ERROR #4 - system reports:\n");
            perror(dcv[dc]);
            exit(-4);
        }
        wcnt=0;                 /* now counts the words of this document */
        while (getword(w,ftp)) {
            encode(w,&dicttree);
            wcnt++;
        }
        fclose(ftp);
        fprintf(stderr,"converting to vector...");
        ix=1;                   /* dict2vect fills v starting at index 1 */
        dict2vect(dicttree,v);
        fprintf(stderr,"done\n");
        /* end convert file into document vector */

        /* start print vector into term-document matrix */
        for (ix=1; ix<=ucnt; ix++) {
            fprintf(fmp,"%.*lg%c",precision,v[ix],ix==ucnt?'\n':' ');
        }
        fflush(fmp);
        /* end print vector into term-document matrix */
    }
    fclose(fmp);
    fprintf(stderr,"...done\n");
    deltree(dicttree);
    fprintf(stderr,"END OF RUN\n");
    return 0;
}
/*
 * Prepare the dictionary for a new document: every node gets tf reset to 0
 * and isfirst set to 1.  Nodes are visited in order (left, node, right).
 */
void docstart(struct st_node *t)
{
    /* walk down the right spine, recursing only into left subtrees */
    while (t != NULL) {
        docstart(t->l);
        t->tf = 0;
        t->isfirst = 1;
        t = t->r;
    }
}
/*
 * Write one weight per dictionary node into v, in tree order, starting at
 * the global index ix (which is advanced once per node).
 *
 * The weight is tf * log(dcn / df) / wcnt.  When the current document has
 * no words (wcnt == 0) the weight is 0 instead of the NaN that 0/0 would
 * produce.
 */
void dict2vect(struct st_node *t, double *v)
{
    if (t==NULL)
        return;
    dict2vect(t->l,v);
    if (wcnt>0)
        v[ix]=(double)t->tf*log((double)dcn/(double)t->df)/(double)wcnt;
    else
        v[ix]=0.;
    ix++;
    dict2vect(t->r,v);
}
/*
 * Allocate one dictionary node.
 *
 * Each node is its own malloc block, because deltree() releases nodes one
 * by one with free().  All fields start from a defined state: empty word,
 * zero counters, isfirst cleared, no children.  ucnt is incremented and the
 * new value becomes the node's label.  The node size is added to totall.
 *
 * Never returns NULL: on allocation failure a message is printed and the
 * process exits with status -1.
 */
struct st_node * newnode(void)
{
    struct st_node *node=malloc(sizeof *node);

    if (node==NULL) {
        fprintf(stderr,"ERROR - memory allocation\n");
        exit(-1);
    }
    totall+=(double)sizeof *node/1000.;

    node->wrd[0]='\0';
    node->tf=0;
    node->df=0;
    node->cf=0;
    node->isfirst=0;    /* addcode reads this flag on the next occurrence */
    node->l=node->r=NULL;
    ucnt++;
    node->lbl=ucnt;
    return node;
}
/*
 * Free the whole tree rooted at t (children first) and decrement ucnt once
 * per freed node.  An empty tree (t == NULL) is accepted and is a no-op.
 * Always returns NULL so the caller can write `root = deltree(root);`.
 */
struct st_node *deltree(struct st_node *t)
{
    if (t==NULL)
        return NULL;
    deltree(t->l);
    deltree(t->r);
    free(t);
    ucnt--;
    return NULL;
}
/*
 * Record one occurrence of word s in the dictionary tree *t.
 *
 * If the word is present, cf is incremented, and df is incremented too when
 * this is the first occurrence since docstart() (isfirst set).  Otherwise a
 * node is created with cf = df = 1 and linked where the search ended.
 *
 * Only as many characters as fit in the wrd field are stored and compared,
 * so longer words are truncated instead of overrunning the node.
 */
void addcode(const char *s, struct st_node **t)
{
    while (*t!=NULL) {
        struct st_node *node=*t;
        int cmp=strncmp(s,node->wrd,sizeof(node->wrd)-1);

        if (cmp==0) {
            node->cf++;
            if (node->isfirst) {
                node->df++;
                node->isfirst=0;
            }
            return;
        }
        t=(cmp>0)?&node->r:&node->l;
    }

    /* not found: *t is the empty link where the word belongs */
    *t=newnode();
    snprintf((*t)->wrd,sizeof((*t)->wrd),"%s",s);
    (*t)->tf=0;
    (*t)->cf=1;
    (*t)->df=1;
    /* the current document is already counted in df */
    (*t)->isfirst=0;
}
/*
 * Look word s up in the dictionary tree *t, increment its tf and return its
 * label.  The comparison covers only as many characters as the wrd field
 * can hold, so a word longer than the field matches its truncated entry.
 *
 * If the word is not in the tree (or the tree is empty) an error is printed
 * and the process exits with status -10; the function does not return.
 */
long int encode(const char *s, struct st_node **t)
{
    struct st_node *node=*t;

    while (node!=NULL) {
        int cmp=strncmp(s,node->wrd,sizeof(node->wrd)-1);

        if (cmp==0) {
            node->tf++;
            return node->lbl;
        }
        node=(cmp>0)?node->r:node->l;
    }
    fprintf(stderr,"ERROR: Trying to encode but dict tree is empty\n");
    exit(-10);
}
/*
 * Print the tree rooted at t to fp in order (left, node, right), one line
 * per node: word, label, df, cf.  The recursion writes to the stream it was
 * given, not to the global fdp.
 */
void printtree(FILE *fp, struct st_node *t)
{
    if (t==NULL)
        return;
    printtree(fp,t->l);
    fprintf(fp,"%-30s %10li %5i %5i\n",t->wrd, t->lbl, t->df, t->cf);
    printtree(fp,t->r);
}
/*
 * Remove suffix s from the end of w, in place, when w ends with s and is
 * strictly longer than s.  In every other case w is left as it is.
 * The return value is always 1.
 */
int stripsuffix(const char *s, char *w)
{
    size_t wlen = strlen(w);
    size_t slen = strlen(s);

    if (wlen > slen) {
        char *tail = w + (wlen - slen);
        if (strcmp(tail, s) == 0)
            *tail = '\0';
    }
    return 1;
}
/*
 * Read the next word (maximal run of letters, per isalpha) from fp into
 * wbuf as a NUL-terminated string; when nocase is set the word is
 * lower-cased.  The non-letter that ends the word is consumed.
 *
 * wbuf must have room for BUFLEN bytes.  A word that would not fit prints
 * an error and exits with status -2.
 *
 * Returns 1 when a word was stored, 0 when end of file (or a read error)
 * is reached before any letter.
 */
int getword(char *wbuf,FILE *fp)
{
    int c;      /* int, so that EOF is distinct from every character */
    int i=0;

    /* skip everything that is not a letter */
    do {
        c=fgetc(fp);
    } while (c!=EOF && !isalpha(c));
    if (c==EOF)
        return 0;

    /* collect the run of letters; check the bound before storing */
    do {
        if (i>=BUFLEN-1) {
            fprintf(stderr,"ERROR: buffer overflow\n");
            exit(-2);
        }
        wbuf[i++]=(char)c;
        c=fgetc(fp);
    } while (c!=EOF && isalpha(c));
    wbuf[i]='\0';

    if (nocase) {
        for (i=0; wbuf[i]!='\0'; i++)
            wbuf[i]=(char)tolower((unsigned char)wbuf[i]);
    }
    return 1;
}
/*
 * Sum of cf over every node of the tree rooted at t; 0 for an empty tree.
 */
long int words(struct st_node *t)
{
    long int total = 0;

    /* iterate along right children, recurse into left subtrees */
    for (; t != NULL; t = t->r)
        total += t->cf + words(t->l);
    return total;
}
/*
 * Entropy, in bits, of the word distribution stored in the tree rooted at
 * t: the sum over all nodes of -p * log2(p) with p = cf / wcnt.
 *
 * The result is printed with the label "BITS", so the natural logarithm is
 * converted to base 2.  wcnt must already hold the collection word total
 * (see words()).  Returns 0 for an empty tree.
 */
double entropy(struct st_node *t)
{
    double p;

    if (t==NULL)
        return 0.;
    p=(double)(t->cf)/(double)wcnt;
    return -p*log(p)/log(2.) + entropy(t->l) + entropy(t->r);
}
char** getdocnames(char*fln)
{
char **dcv;
char nbuf[BUFLEN];
FILE *flp;
if ((flp=fopen(fln,"r"))==NULL)
fprintf(stderr,"ERROR #6 - system reports:\n"), perror(fln), exit(-4);
if(fgets(nbuf,BUFLEN,flp)==NULL) {
fprintf(stderr,"ERROR - While scanning the list of document names:\n");
fprintf(stderr,"Document name list ended prematurely\n");
exit(-9);
}
if(sscanf(nbuf,"%li",&dcn)==0) {
fprintf(stderr,"ERROR - While scanning the list of document names:\n");
fprintf(stderr,"The list should start with the number of documents\n");
exit(-7);
}
if(dcn==0) {
fprintf(stderr,"ERROR - While scanning the list of document names:\n");
fprintf(stderr,"The number of documents should be > 0\n");
exit(-8);
}
dcv=(char**)malloc((1+dcn)*sizeof(char*));
totall+=(double)((1+dcn)*sizeof(char*))/1000.;
for(dc=1;dc<=dcn;dc++) {
if(fgets(nbuf,BUFLEN,flp)==NULL) {
fprintf(stderr,"ERROR - While scanning the list of document names:\n");
fprintf(stderr,"Document name list ended prematurely\n");
exit(-9);
}
dcv[dc]=string(BUFLEN);
totall+=(strlen(nbuf)*sizeof(char))/1000.;
if(dcv[dc]==NULL)
fprintf(stderr,"ERROR - Out of memory for document name list\n"),exit(-1);
strcpy(dcv[dc],nbuf);
dcv[dc][strlen(nbuf)-1]='\0';
}
return dcv;
}
/*
 * Allocate an uninitialised buffer of len chars and add its size to totall.
 * On allocation failure a message is printed and the process exits with
 * status -1, so the returned pointer is never NULL.
 */
char *string(int len)
{
    char *buf = (char *)malloc((size_t)(len * sizeof(char)));

    if (buf == NULL) {
        fprintf(stderr, "allocazione non riuscita in string()");
        exit(-1);
    }
    totall += (double)(len * sizeof(char)) / 1000.;
    return buf;
}
/*
 * Allocate an uninitialised array of len + 1 doubles (so that indices
 * 1..len are usable) and add its size to totall.  On allocation failure a
 * message is printed and the process exits with status -1.
 */
double *vect(int len)
{
    double *arr = (double *)malloc((size_t)((len + 1) * sizeof(double)));

    if (arr == NULL) {
        fprintf(stderr, "allocazione non riuscita in vect()");
        exit(-1);
    }
    totall += (double)((len + 1) * sizeof(double)) / 1000.;
    return arr;
}
Esso prende in input un file generico e crea una matrice termine-documento che indica le occorrenze del termine nel documento.
Dovrei fargli caricare lista.txt e vorrei che il contenuto venisse memorizzato all'interno di una variabile.
Come potrei fare?
Dovrei inserire questo codice dove ho inserito ------------------------------------
/* User-tunable settings for a conversion run. */
long nout = 100;    /* a progress line is printed every nout tokens read */
long nsave = 500;   /* the dictionary is saved every nsave tokens read */
int nocase = 1;     /* non-zero: tokens are lower-cased before use */
int precision = 2;  /* precision given to "%.*lg" when printing the matrix */
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
/* Sizes used throughout the program. */
enum {
    BUFLEN = 4096,  /* size of file-name and line buffers */
    WLEN = 30       /* size of the word field of a dictionary node */
};

/* Running total of the memory accounted for by the allocators, in bytes/1000. */
double totall = 0.0;
/* Node of the binary search tree that serves as the word dictionary. */
struct st_node {
char wrd[WLEN]; /* the word, copied in by addcode */
long int lbl; /* label: value of ucnt right after the node was created */
int tf; /* term freq (n occurrences in a doc) */
int df; /* document freq (n docs with occurrence) */
int cf; /* collection freq (total occurrences in coll) */
struct st_node *l, *r; /* children: smaller words go left, larger go right */
int isfirst; /* set to 1 by docstart, cleared by addcode when it bumps df */
};
/* ---- program-wide state ---- */

/* ucnt: nodes in the dictionary; rdcnt: tokens read in pass 1;
 * wcnt: word total (whole collection while the dictionary is written,
 * current document while the matrix is built). */
long int ucnt, rdcnt, wcnt;
long int ix;                      /* next slot of v written by dict2vect */
double entr;                      /* entropy written to the dictionary file */
FILE *ftp;                        /* document currently being read */
char fln[BUFLEN];                 /* name of the document-list file */
char fdn[]="dict.a"; FILE *fdp;   /* dictionary output file */
char fmn[]="tdm.dat"; FILE *fmp;  /* term-document matrix output file */
/* Token buffer filled by getword(). getword() limits its writes using
 * BUFLEN, so the buffer has to be BUFLEN+1 bytes; a WLEN-sized buffer is
 * overrun by any word of WLEN or more letters. */
char w[BUFLEN+1];
char **dcv;                       /* document names, used at indices 1..dcn */
long int dc=1;                    /* index of the current document */
long int dcn=0;                   /* number of documents */
double *v;                        /* current document vector, indices 1..ucnt */

/* ---- prototypes ---- */
struct st_node * newnode(void);
struct st_node * deltree(struct st_node *);
void printtree(FILE *,struct st_node *);
void addcode(const char *, struct st_node **);
long int encode(const char *, struct st_node **);
double entropy(struct st_node*);
long int words(struct st_node*);
void docstart(struct st_node*);
char** getdocnames(char*fln);
void dict2vect(struct st_node *,double *);
double *vect(int);
char *string(int);
/* These two were used/defined without a declaration in scope. */
int stripsuffix(const char *, char *);
int getword(char *, FILE *);
/*
 * Build a word dictionary and a term-document matrix.
 *
 * argv[1] names a list file (read by getdocnames).  Pass 1 reads every
 * listed document and fills the dictionary tree, saving it to fdn every
 * nsave tokens and once more, with a header, at the end.  Pass 2 re-reads
 * every document, converts it to a vector with dict2vect and appends that
 * vector as one line of fmn.
 *
 * Returns 0 on success; every error path leaves through exit() with a
 * negative status after printing a message on stderr.
 */
int main(int argc, char *argv[])
{
    struct st_node *dicttree=NULL;
    int n;

    /* start command line */
    if (argc>2) {
        fprintf(stderr,"WARNING - extra strings on command line\n");
    } else if (argc<2) {
        fprintf(stderr,"ERROR - no filename given on command line\n");
        exit(-3);
    }
    /* bounded copy: fln holds only BUFLEN bytes */
    n=snprintf(fln,sizeof fln,"%s",argv[1]);
    if (n<0 || (size_t)n>=sizeof fln) {
        fprintf(stderr,"ERROR - filename too long on command line\n");
        exit(-3);
    }
    /* end command line */

    /* start initializing algorithm */
    rdcnt=0;
    ucnt=0;
    dcv=getdocnames(fln);
    /* end initializing algorithm */

    /* NOTE: the dashed line below is the forum poster's insertion marker,
     * not C code; it is kept because the surrounding text refers to it. */
--------------------------------------------------------------------------------
    for (dc=1; dc<=dcn; dc++) {
        /* start read file into binary tree */
        docstart(dicttree);
        if ((ftp=fopen(dcv[dc],"r"))==NULL) {
            fprintf(stderr,"ERROR #1 - system reports:\n");
            perror(dcv[dc]);
            exit(-4);
        }
        /* w is already a char array: pass it directly, not its address */
        while (getword(w,ftp)) {
            rdcnt++;
            addcode(w,&dicttree);
            if (rdcnt%nout==0) {
                fprintf(stderr,"\r %12li read - %12li unique MEM %.1lf KB ",
                        rdcnt,ucnt,totall);
            }
            if (rdcnt%nsave==0) {
                /* periodic checkpoint of the dictionary (no header) */
                fprintf(stderr,"\r(saving...)");
                if ((fdp=fopen(fdn,"w"))==NULL) {
                    fprintf(stderr,"ERROR #2 - system reports:\n");
                    perror(fdn);
                    exit(-4);
                }
                printtree(fdp,dicttree);
                fclose(fdp);
            }
        }
        fclose(ftp);
        fprintf(stderr,"\n");
        /* end read file into binary tree */
    }

    /* start write tree on file */
    if (rdcnt<=0) {
        fprintf(stderr,"ERROR - no words found\n");
        exit(-5);
    }
    fprintf(stderr,"\rTotal %li tokens read - %li unique words\n",rdcnt,ucnt);
    fprintf(stderr,"Total memory allocated: %.0lf kilobytes\n",totall);
    if ((fdp=fopen(fdn,"w"))==NULL) {
        fprintf(stderr,"ERROR #3 - system reports:\n");
        perror(fdn);
        exit(-4);
    }
    wcnt=words(dicttree);       /* entropy() divides by wcnt */
    entr=entropy(dicttree);
    fprintf(fdp,"STORED %10li\n",wcnt);
    fprintf(fdp,"UNIQUE %10li\n",ucnt);
    fprintf(fdp,"ENTROPY %.2lf BITS \n",entr);
    fprintf(fdp,"Term-------------------------- -----Label ---df ---cf\n");
    printtree(fdp,dicttree);
    fclose(fdp);
    /* end write tree on file */

    /* start initialize term-document matrix file */
    v=vect(ucnt);
    fprintf(stderr,"\nAllocated document vector - Total memory now %.0lf KB\n\n",
            totall);
    if ((fmp=fopen(fmn,"w"))==NULL) {
        fprintf(stderr,"ERROR #5 - system reports:\n");
        perror(fmn);
        exit(-4);
    }
    fprintf(stderr,"\nPrinting term-document matrix...");
    fprintf(fmp,"%li\n", dcn);
    fprintf(fmp,"%li\n", ucnt);
    /* end initialize term-document matrix file */

    for (dc=1; dc<=dcn; dc++) {
        /* start convert file into document vector */
        docstart(dicttree);
        /* dc is a long: it needs %li, not %i */
        fprintf(stderr,"\rReading document %6li...",dc);
        if ((ftp=fopen(dcv[dc],"r"))==NULL) {
            fprintf(stderr,"ERROR #4 - system reports:\n");
            perror(dcv[dc]);
            exit(-4);
        }
        wcnt=0;                 /* now counts the words of this document */
        while (getword(w,ftp)) {
            encode(w,&dicttree);
            wcnt++;
        }
        fclose(ftp);
        fprintf(stderr,"converting to vector...");
        ix=1;                   /* dict2vect fills v starting at index 1 */
        dict2vect(dicttree,v);
        fprintf(stderr,"done\n");
        /* end convert file into document vector */

        /* start print vector into term-document matrix */
        for (ix=1; ix<=ucnt; ix++) {
            fprintf(fmp,"%.*lg%c",precision,v[ix],ix==ucnt?'\n':' ');
        }
        fflush(fmp);
        /* end print vector into term-document matrix */
    }
    fclose(fmp);
    fprintf(stderr,"...done\n");
    deltree(dicttree);
    fprintf(stderr,"END OF RUN\n");
    return 0;
}
/*
 * Prepare the dictionary for a new document: every node gets tf reset to 0
 * and isfirst set to 1.  Nodes are visited in order (left, node, right).
 */
void docstart(struct st_node *t)
{
    /* walk down the right spine, recursing only into left subtrees */
    while (t != NULL) {
        docstart(t->l);
        t->tf = 0;
        t->isfirst = 1;
        t = t->r;
    }
}
/*
 * Write one weight per dictionary node into v, in tree order, starting at
 * the global index ix (which is advanced once per node).
 *
 * The weight is tf * log(dcn / df) / wcnt.  When the current document has
 * no words (wcnt == 0) the weight is 0 instead of the NaN that 0/0 would
 * produce.
 */
void dict2vect(struct st_node *t, double *v)
{
    if (t==NULL)
        return;
    dict2vect(t->l,v);
    if (wcnt>0)
        v[ix]=(double)t->tf*log((double)dcn/(double)t->df)/(double)wcnt;
    else
        v[ix]=0.;
    ix++;
    dict2vect(t->r,v);
}
/*
 * Allocate one dictionary node.
 *
 * Each node is its own malloc block, because deltree() releases nodes one
 * by one with free().  All fields start from a defined state: empty word,
 * zero counters, isfirst cleared, no children.  ucnt is incremented and the
 * new value becomes the node's label.  The node size is added to totall.
 *
 * Never returns NULL: on allocation failure a message is printed and the
 * process exits with status -1.
 */
struct st_node * newnode(void)
{
    struct st_node *node=malloc(sizeof *node);

    if (node==NULL) {
        fprintf(stderr,"ERROR - memory allocation\n");
        exit(-1);
    }
    totall+=(double)sizeof *node/1000.;

    node->wrd[0]='\0';
    node->tf=0;
    node->df=0;
    node->cf=0;
    node->isfirst=0;    /* addcode reads this flag on the next occurrence */
    node->l=node->r=NULL;
    ucnt++;
    node->lbl=ucnt;
    return node;
}
/*
 * Free the whole tree rooted at t (children first) and decrement ucnt once
 * per freed node.  An empty tree (t == NULL) is accepted and is a no-op.
 * Always returns NULL so the caller can write `root = deltree(root);`.
 */
struct st_node *deltree(struct st_node *t)
{
    if (t==NULL)
        return NULL;
    deltree(t->l);
    deltree(t->r);
    free(t);
    ucnt--;
    return NULL;
}
/*
 * Record one occurrence of word s in the dictionary tree *t.
 *
 * If the word is present, cf is incremented, and df is incremented too when
 * this is the first occurrence since docstart() (isfirst set).  Otherwise a
 * node is created with cf = df = 1 and linked where the search ended.
 *
 * Only as many characters as fit in the wrd field are stored and compared,
 * so longer words are truncated instead of overrunning the node.
 */
void addcode(const char *s, struct st_node **t)
{
    while (*t!=NULL) {
        struct st_node *node=*t;
        int cmp=strncmp(s,node->wrd,sizeof(node->wrd)-1);

        if (cmp==0) {
            node->cf++;
            if (node->isfirst) {
                node->df++;
                node->isfirst=0;
            }
            return;
        }
        t=(cmp>0)?&node->r:&node->l;
    }

    /* not found: *t is the empty link where the word belongs */
    *t=newnode();
    snprintf((*t)->wrd,sizeof((*t)->wrd),"%s",s);
    (*t)->tf=0;
    (*t)->cf=1;
    (*t)->df=1;
    /* the current document is already counted in df */
    (*t)->isfirst=0;
}
/*
 * Look word s up in the dictionary tree *t, increment its tf and return its
 * label.  The comparison covers only as many characters as the wrd field
 * can hold, so a word longer than the field matches its truncated entry.
 *
 * If the word is not in the tree (or the tree is empty) an error is printed
 * and the process exits with status -10; the function does not return.
 */
long int encode(const char *s, struct st_node **t)
{
    struct st_node *node=*t;

    while (node!=NULL) {
        int cmp=strncmp(s,node->wrd,sizeof(node->wrd)-1);

        if (cmp==0) {
            node->tf++;
            return node->lbl;
        }
        node=(cmp>0)?node->r:node->l;
    }
    fprintf(stderr,"ERROR: Trying to encode but dict tree is empty\n");
    exit(-10);
}
/*
 * Print the tree rooted at t to fp in order (left, node, right), one line
 * per node: word, label, df, cf.  The recursion writes to the stream it was
 * given, not to the global fdp.
 */
void printtree(FILE *fp, struct st_node *t)
{
    if (t==NULL)
        return;
    printtree(fp,t->l);
    fprintf(fp,"%-30s %10li %5i %5i\n",t->wrd, t->lbl, t->df, t->cf);
    printtree(fp,t->r);
}
/*
 * Remove suffix s from the end of w, in place, when w ends with s and is
 * strictly longer than s.  In every other case w is left as it is.
 * The return value is always 1.
 */
int stripsuffix(const char *s, char *w)
{
    size_t wlen = strlen(w);
    size_t slen = strlen(s);

    if (wlen > slen) {
        char *tail = w + (wlen - slen);
        if (strcmp(tail, s) == 0)
            *tail = '\0';
    }
    return 1;
}
/*
 * Read the next word (maximal run of letters, per isalpha) from fp into
 * wbuf as a NUL-terminated string; when nocase is set the word is
 * lower-cased.  The non-letter that ends the word is consumed.
 *
 * wbuf must have room for BUFLEN bytes.  A word that would not fit prints
 * an error and exits with status -2.
 *
 * Returns 1 when a word was stored, 0 when end of file (or a read error)
 * is reached before any letter.
 */
int getword(char *wbuf,FILE *fp)
{
    int c;      /* int, so that EOF is distinct from every character */
    int i=0;

    /* skip everything that is not a letter */
    do {
        c=fgetc(fp);
    } while (c!=EOF && !isalpha(c));
    if (c==EOF)
        return 0;

    /* collect the run of letters; check the bound before storing */
    do {
        if (i>=BUFLEN-1) {
            fprintf(stderr,"ERROR: buffer overflow\n");
            exit(-2);
        }
        wbuf[i++]=(char)c;
        c=fgetc(fp);
    } while (c!=EOF && isalpha(c));
    wbuf[i]='\0';

    if (nocase) {
        for (i=0; wbuf[i]!='\0'; i++)
            wbuf[i]=(char)tolower((unsigned char)wbuf[i]);
    }
    return 1;
}
/*
 * Sum of cf over every node of the tree rooted at t; 0 for an empty tree.
 */
long int words(struct st_node *t)
{
    long int total = 0;

    /* iterate along right children, recurse into left subtrees */
    for (; t != NULL; t = t->r)
        total += t->cf + words(t->l);
    return total;
}
/*
 * Entropy, in bits, of the word distribution stored in the tree rooted at
 * t: the sum over all nodes of -p * log2(p) with p = cf / wcnt.
 *
 * The result is printed with the label "BITS", so the natural logarithm is
 * converted to base 2.  wcnt must already hold the collection word total
 * (see words()).  Returns 0 for an empty tree.
 */
double entropy(struct st_node *t)
{
    double p;

    if (t==NULL)
        return 0.;
    p=(double)(t->cf)/(double)wcnt;
    return -p*log(p)/log(2.) + entropy(t->l) + entropy(t->r);
}
char** getdocnames(char*fln)
{
char **dcv;
char nbuf[BUFLEN];
FILE *flp;
if ((flp=fopen(fln,"r"))==NULL)
fprintf(stderr,"ERROR #6 - system reports:\n"), perror(fln), exit(-4);
if(fgets(nbuf,BUFLEN,flp)==NULL) {
fprintf(stderr,"ERROR - While scanning the list of document names:\n");
fprintf(stderr,"Document name list ended prematurely\n");
exit(-9);
}
if(sscanf(nbuf,"%li",&dcn)==0) {
fprintf(stderr,"ERROR - While scanning the list of document names:\n");
fprintf(stderr,"The list should start with the number of documents\n");
exit(-7);
}
if(dcn==0) {
fprintf(stderr,"ERROR - While scanning the list of document names:\n");
fprintf(stderr,"The number of documents should be > 0\n");
exit(-8);
}
dcv=(char**)malloc((1+dcn)*sizeof(char*));
totall+=(double)((1+dcn)*sizeof(char*))/1000.;
for(dc=1;dc<=dcn;dc++) {
if(fgets(nbuf,BUFLEN,flp)==NULL) {
fprintf(stderr,"ERROR - While scanning the list of document names:\n");
fprintf(stderr,"Document name list ended prematurely\n");
exit(-9);
}
dcv[dc]=string(BUFLEN);
totall+=(strlen(nbuf)*sizeof(char))/1000.;
if(dcv[dc]==NULL)
fprintf(stderr,"ERROR - Out of memory for document name list\n"),exit(-1);
strcpy(dcv[dc],nbuf);
dcv[dc][strlen(nbuf)-1]='\0';
}
return dcv;
}
/*
 * Allocate an uninitialised buffer of len chars and add its size to totall.
 * On allocation failure a message is printed and the process exits with
 * status -1, so the returned pointer is never NULL.
 */
char *string(int len)
{
    char *buf = (char *)malloc((size_t)(len * sizeof(char)));

    if (buf == NULL) {
        fprintf(stderr, "allocazione non riuscita in string()");
        exit(-1);
    }
    totall += (double)(len * sizeof(char)) / 1000.;
    return buf;
}
/*
 * Allocate an uninitialised array of len + 1 doubles (so that indices
 * 1..len are usable) and add its size to totall.  On allocation failure a
 * message is printed and the process exits with status -1.
 */
double *vect(int len)
{
    double *arr = (double *)malloc((size_t)((len + 1) * sizeof(double)));

    if (arr == NULL) {
        fprintf(stderr, "allocazione non riuscita in vect()");
        exit(-1);
    }
    totall += (double)((len + 1) * sizeof(double)) / 1000.;
    return arr;
}
Esso prende in input un file generico e crea una matrice termine-documento che indica le occorrenze del termine nel documento.
Dovrei fargli caricare lista.txt e vorrei che il contenuto venisse memorizzato all'interno di una variabile.
Come potrei fare?
Dovrei inserire questo codice dove ho inserito ------------------------------------