Visualizzazione dei risultati da 1 a 3 su 3

Discussione: [C] Caricamento file

  1. #1

    [C] Caricamento file

    Allora ho questo file t2v.c :

    /*
     * Tunable settings: edit these to control the conversion and its output.
     */
    /* Progress is reported on stderr once every `nout` tokens processed. */
    long nout = 100;
    /* The dictionary is written to disk once every `nsave` tokens processed. */
    long nsave = 500;
    /* Non-zero: tokens are lower-cased, so word matching ignores case. */
    int nocase = 1;
    /* Precision handed to the %.*lg conversion when the matrix is printed. */
    int precision = 2;

    #include <ctype.h>
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Size of the general-purpose text buffers (file names, input lines). */
    #define BUFLEN 4096
    /* Capacity of a stored word, terminating NUL included. */
    #define WLEN 30

    /* Running total of the memory obtained from malloc, in units of 1000 bytes. */
    double totall = 0.0;

    /*
     * One dictionary entry. Entries form a binary search tree ordered by
     * string comparison on `wrd`.
     */
    struct st_node {
        char wrd[WLEN];       /* the word itself */
        long lbl;             /* label given to the word when its node is created */
        int tf;               /* term frequency: occurrences in the current document */
        int df;               /* document frequency: documents containing the word */
        int cf;               /* collection frequency: occurrences in all documents */
        struct st_node *l;    /* subtree of the words that compare lower */
        struct st_node *r;    /* subtree of the words that compare higher */
        int isfirst;          /* set by docstart(); cleared by addcode() once the
                                 word has been counted in df for this document */
    };

    /* Number of distinct words currently stored in the dictionary tree. */
    long ucnt;
    /* Number of tokens read so far during dictionary construction. */
    long rdcnt;
    /* Word count: main() uses it first for the whole collection, then per document. */
    long wcnt;
    /* Next slot of the document vector while dict2vect() fills it. */
    long ix;
    /* Entropy value computed by entropy() and written to the dictionary file. */
    double entr;
    /* Stream of the document currently being read. */
    FILE *ftp;
    /* Name of the document-list file, taken from the command line. */
    char fln[BUFLEN];
    /* Dictionary output: file name and stream. */
    char fdn[] = "dict.a";
    FILE *fdp;
    /* Term-document matrix output: file name and stream. */
    char fmn[] = "tdm.dat";
    FILE *fmp;
    /* Most recently read token. */
    char w[WLEN];
    /* Document file names, stored at indices 1..dcn. */
    char **dcv;
    /* Index of the document being processed. */
    long dc = 1;
    /* Total number of documents, read from the first line of the list file. */
    long dcn = 0;
    /* Weight vector of the current document, indices 1..ucnt. */
    double *v;

    /* Forward declarations of every function defined after main(). */
    struct st_node *newnode(void);
    struct st_node *deltree(struct st_node *t);
    void printtree(FILE *fp, struct st_node *t);
    void addcode(const char *s, struct st_node **t);
    long int encode(const char *s, struct st_node **t);
    double entropy(struct st_node *t);
    long int words(struct st_node *t);
    void docstart(struct st_node *t);
    char **getdocnames(char *fln);
    void dict2vect(struct st_node *t, double *v);
    double *vect(int len);
    char *string(int len);
    /* getword() and stripsuffix() are also defined below main(); declaring them
       here lets the compiler check their calls against a prototype instead of
       falling back on an implicit declaration. */
    int getword(char *wbuf, FILE *fp);
    int stripsuffix(const char *s, char *w);

    int main(int argc, char *argv[])
    {
    struct st_node *dicttree=NULL;
    long int code;

    /* start command line */
    if (argc>2)
    fprintf(stderr,"WARNING - extra strings on command line\n");
    else if (argc==1)
    fprintf(stderr,"ERROR - no filename given on command line\n"),exit(-3);
    strcpy(fln,argv[1]);
    /* end command line */

    /* start initializing algorithm */
    rdcnt=0;
    ucnt=0;
    dcv=getdocnames(fln);
    /* end initializing algorithm */

    --------------------------------------------------------------------------------

    for(dc=1;dc<=dcn;dc++) {
    /* start read file into binary tree */
    docstart(dicttree);
    if ((ftp=fopen(dcv[dc],"r"))==NULL)
    fprintf(stderr,"ERROR #1 - system reports:\n"),perror(dcv[dc]), exit(-4);
    while (getword(&w,ftp)) {
    rdcnt++;
    addcode(w,&dicttree);
    if (rdcnt%nout==0) {
    fprintf(stderr,"\r %12li read - %12li unique MEM %.1lf KB ",
    rdcnt,ucnt,totall);
    /*DEBUG*/fprintf(stderr,"");
    }
    if (rdcnt%nsave==0) {
    fprintf(stderr,"\r(saving...)");
    if ((fdp=fopen(fdn,"w"))==NULL)
    fprintf(stderr,"ERROR #2 - system reports:\n"),perror(fdn), exit(-4);
    printtree(fdp,dicttree);
    fclose(fdp);
    }
    }
    fclose(ftp);
    fprintf(stderr,"\n");
    /* end read file into binary tree */
    }

    /* start write tree on file */
    if (rdcnt<=0)
    fprintf(stderr,"ERROR - no words found\n"), exit(-5);
    else
    fprintf(stderr,"\rTotal %li tokens read - %li unique words\n",rdcnt,ucnt);
    fprintf(stderr,"Total memory allocated: %.0lf kilobytes\n",totall);
    if ((fdp=fopen(fdn,"w"))==NULL)
    fprintf(stderr,"ERROR #3 - system reports:\n"),perror(fdn), exit(-4);
    wcnt=words(dicttree);
    entr=entropy(dicttree);
    fprintf(fdp,"STORED %10li\n",wcnt);
    fprintf(fdp,"UNIQUE %10li\n",ucnt);
    fprintf(fdp,"ENTROPY %.2lf BITS \n",entr);
    fprintf(fdp,"Term-------------------------- -----Label ---df ---cf\n");
    printtree(fdp,dicttree);
    fclose(fdp);
    /* end write tree on file */

    /* start initialize term-document matrix file */
    v=vect(ucnt);
    fprintf(stderr,"\nAllocated document vector - Total memory now %.0lf KB\n\n",
    totall);
    if ((fmp=fopen(fmn,"w"))==NULL)
    fprintf(stderr,"ERROR #5 - system reports:\n"), perror(fmn), exit(-4);
    fprintf(stderr,"\nPrinting term-document matrix...");
    fprintf(fmp,"%li\n", dcn);
    fprintf(fmp,"%li\n", ucnt);
    /* end initialize term-document matrix file */

    for(dc=1;dc<=dcn;dc++) {
    /* start convert file into document vector */
    docstart(dicttree);
    fprintf(stderr,"\rReading document %6i...",dc);
    if ((ftp=fopen(dcv[dc],"r"))==NULL)
    fprintf(stderr,"ERROR #4 - system reports:\n"),perror(dcv[dc]), exit(-4);
    wcnt=0;
    while (getword(&w,ftp))
    encode(w,&dicttree),wcnt++;
    fclose(ftp);
    fprintf(stderr,"converting to vector...");
    ix=1;
    dict2vect(dicttree,v);
    fprintf(stderr,"done\n");
    /* end convert file into document vector */

    /* start print vector into term-document matrix */
    for(ix=1;ix<=ucnt;ix++) {
    fprintf(fmp,"%.*lg%c",precision,v[ix],ix==ucnt?'\n':' ');
    }
    fflush(fmp);
    /* end print vector into term-document matrix */
    }
    fclose(fmp);
    fprintf(stderr,"...done\n");

    deltree(dicttree);
    fprintf(stderr,"END OF RUN\n");
    return 0;
    }

    /*
     * Prepares the whole dictionary for a new document: in every node of the
     * tree rooted at `t` the per-document term frequency is cleared and the
     * "not yet seen in this document" flag is raised. A NULL tree is a no-op.
     */
    void docstart(struct st_node *t)
    {
        if (t != NULL) {
            docstart(t->l);
            t->isfirst = 1;
            t->tf = 0;
            docstart(t->r);
        }
    }

    /*
     * Converts the per-document counts held in the dictionary into a weight
     * vector. The tree is walked in order (left subtree, node, right subtree)
     * and each node stores
     *
     *     tf * log(dcn / df) / wcnt
     *
     * into the next slot of `v`.
     *
     * t  root of the (sub)tree to convert; NULL is a no-op
     * v  destination array; slots are taken from the global index `ix`, which
     *    the caller sets to the first slot before the top-level call and which
     *    is advanced by one for every node visited
     *
     * Reads the globals `dcn` (number of documents) and `wcnt` (word count of
     * the current document). A node with df == 0, or a document with no words
     * at all, yields a weight of 0 rather than a division by zero.
     */
    void dict2vect(struct st_node *t, double *v)
    {
        if (t == NULL)
            return;
        dict2vect(t->l, v);
        if (t->df > 0 && wcnt > 0)
            v[ix] = (double)t->tf * log((double)dcn / (double)t->df) / (double)wcnt;
        else
            v[ix] = 0.;
        ix++;
        dict2vect(t->r, v);
    }

    /*
     * Hands out a fresh dictionary node.
     *
     * Nodes are carved out of a chunk of `memsize` nodes that is refilled when
     * it runs out. The chunk is zero-filled, so the word and all counters of a
     * new node start at zero; the child links are set to NULL explicitly. The
     * global `ucnt` is incremented and its new value becomes the node's label.
     * The memory obtained is added to `totall`.
     *
     * Returns the new node. Exits the process with status -1 when memory
     * cannot be allocated.
     *
     * NOTE: memsize must stay 1 as long as deltree() releases every node with
     * its own free() call; a node taken from the middle of a larger chunk
     * cannot be freed individually.
     */
    struct st_node * newnode(void)
    {
        const int memsize = 1;
        static int imem = 0;
        static struct st_node *mem = NULL;
        struct st_node *tmp;

        if (imem == 0) {
            mem = calloc((size_t)memsize, sizeof *mem);
            if (mem == NULL) {
                fprintf(stderr, "ERROR - memory allocation\n");
                exit(-1);
            }
            /* only mark the chunk as available once it really exists */
            imem = memsize;
            totall += memsize * (double)sizeof(struct st_node) / 1000.;
        }
        tmp = mem + (--imem);
        tmp->l = tmp->r = NULL;
        ucnt++;
        tmp->lbl = ucnt;
        return tmp;
    }

    /*
     * Releases every node of the tree rooted at `t`, children before their
     * parent, and decrements the global `ucnt` once per node released.
     * An empty tree (t == NULL) is accepted and nothing is done.
     *
     * Always returns NULL, so the caller can write `root = deltree(root);`.
     */
    struct st_node *deltree(struct st_node *t)
    {
        if (t == NULL)
            return NULL;
        deltree(t->l);
        deltree(t->r);
        free(t);
        ucnt--;
        return NULL;
    }

    void addcode(const char *s, struct st_node **t)
    {
    int cmp;

    if ((*t)==NULL) {
    *t=newnode();
    strcpy((*t)->wrd,s);
    (*t)->cf=1;
    (*t)->df=1;
    return;
    }
    cmp=strcmp(s,(*t)->wrd);
    if (cmp==0) {
    (*t)->cf++;
    if((*t)->isfirst){
    (*t)->df++;
    (*t)->isfirst=0;
    }
    return;
    }
    if (cmp>0)
    return addcode(s,&((*t)->r));
    if (cmp<0)
    return addcode(s,&((*t)->l));
    }

    long int encode(const char *s, struct st_node **t)
    {
    int cmp;

    if ((*t)==NULL) {
    fprintf(stderr,"ERROR: Trying to encode but dict tree is empty\n");
    exit(-10);
    }
    cmp=strcmp(s,(*t)->wrd);
    if (cmp==0) {
    (*t)->tf++;
    return (*t)->lbl;
    }
    if (cmp>0)
    return encode(s,&((*t)->r));
    if (cmp<0)
    return encode(s,&((*t)->l));
    }

    /*
     * Writes the dictionary to `fp`, one line per word in tree order (left
     * subtree, node, right subtree): the word, its label, its document
     * frequency and its collection frequency.
     *
     * fp  destination stream; the recursion writes to this same stream, not
     *     to a global one
     * t   root of the (sub)tree to print; NULL prints nothing
     */
    void printtree(FILE *fp, struct st_node *t)
    {
        if (t == NULL)
            return;
        printtree(fp, t->l);
        fprintf(fp, "%-30s %10li %5i %5i\n", t->wrd, t->lbl, t->df, t->cf);
        printtree(fp, t->r);
    }

    /*
     * Removes the suffix `s` from the end of the word `w`, in place.
     *
     * The word is shortened only when it ends with `s` and is strictly longer
     * than `s`; a word equal to the suffix is left untouched, and so is any
     * word that does not end with it.
     *
     * Always returns 1, whether or not anything was removed.
     */
    int stripsuffix(const char *s, char *w)
    {
        size_t wordlen = strlen(w);
        size_t suflen = strlen(s);

        if (wordlen > suflen) {
            char *tail = w + (wordlen - suflen);

            if (memcmp(tail, s, suflen) == 0)
                *tail = '\0';
        }
        return 1;
    }

    int getword(char *wbuf,FILE *fp)
    {
    char c;
    int i;
    /* start leggi token */
    START:
    i=0;
    do {
    c=fgetc(fp);
    } while(!isalpha(c) && !feof(fp));
    if (feof(fp))
    return 0;
    do {
    wbuf[i++]=c;
    if (i>BUFLEN)
    fprintf(stderr,"ERROR: buffer overflow\n"),exit(-2);
    c=fgetc(fp);
    } while(isalpha(c) && !feof(fp));
    wbuf[i]='\0';
    /* end leggi token */
    if(nocase)strlwr(wbuf);
    return 1;
    }

    /*
     * Returns the sum of the collection frequencies (`cf`) of all the nodes in
     * the tree rooted at `t`; an empty tree counts as 0.
     */
    long int words(struct st_node *t)
    {
        long int total;

        if (t == NULL)
            return 0;
        total = t->cf;
        total += words(t->l);
        total += words(t->r);
        return total;
    }

    /*
     * Returns the entropy, in bits, of the word distribution stored in the
     * tree rooted at `t`: the sum over all nodes of -p * log2(p), where
     * p = cf / wcnt. An empty tree contributes 0.
     *
     * The global `wcnt` must hold the total word count of the collection (the
     * value returned by words()) before this function is called.
     *
     * The base-2 logarithm is obtained as log(p) / log(2), so that the result
     * is really expressed in bits, the unit the value is printed with.
     */
    double entropy(struct st_node *t)
    {
        double p;
        double val;

        if (t == NULL)
            return 0.;
        p = (double)(t->cf) / (double)wcnt;
        val = -p * (log(p) / log(2.0));
        val = val + entropy(t->l);
        val = val + entropy(t->r);
        return val;
    }

    char** getdocnames(char*fln)
    {
    char **dcv;
    char nbuf[BUFLEN];
    FILE *flp;

    if ((flp=fopen(fln,"r"))==NULL)
    fprintf(stderr,"ERROR #6 - system reports:\n"), perror(fln), exit(-4);
    if(fgets(nbuf,BUFLEN,flp)==NULL) {
    fprintf(stderr,"ERROR - While scanning the list of document names:\n");
    fprintf(stderr,"Document name list ended prematurely\n");
    exit(-9);
    }
    if(sscanf(nbuf,"%li",&dcn)==0) {
    fprintf(stderr,"ERROR - While scanning the list of document names:\n");
    fprintf(stderr,"The list should start with the number of documents\n");
    exit(-7);
    }
    if(dcn==0) {
    fprintf(stderr,"ERROR - While scanning the list of document names:\n");
    fprintf(stderr,"The number of documents should be > 0\n");
    exit(-8);
    }
    dcv=(char**)malloc((1+dcn)*sizeof(char*));
    totall+=(double)((1+dcn)*sizeof(char*))/1000.;
    for(dc=1;dc<=dcn;dc++) {
    if(fgets(nbuf,BUFLEN,flp)==NULL) {
    fprintf(stderr,"ERROR - While scanning the list of document names:\n");
    fprintf(stderr,"Document name list ended prematurely\n");
    exit(-9);
    }
    dcv[dc]=string(BUFLEN);
    totall+=(strlen(nbuf)*sizeof(char))/1000.;
    if(dcv[dc]==NULL)
    fprintf(stderr,"ERROR - Out of memory for document name list\n"),exit(-1);
    strcpy(dcv[dc],nbuf);
    dcv[dc][strlen(nbuf)-1]='\0';
    }
    return dcv;
    }

    /*
     * Allocates an uninitialised buffer of `len` characters and adds its size
     * to the global memory total `totall`.
     *
     * Returns the buffer. If the allocation fails a message is printed and the
     * process exits with status -1.
     */
    char *string(int len)
    {
        size_t nbytes = (size_t)(len * sizeof(char));
        char *s = (char *)malloc(nbytes);

        if (s == NULL) {
            fprintf(stderr,"allocazione non riuscita in string()");
            exit(-1);
        }
        totall += (double)(len * sizeof(char)) / 1000.;
        return s;
    }

    /*
     * Allocates an uninitialised array of len+1 doubles, so that the indices
     * 0..len are all valid, and adds its size to the global memory total
     * `totall`.
     *
     * Returns the array. If the allocation fails a message is printed and the
     * process exits with status -1.
     */
    double *vect(int len)
    {
        size_t nbytes = (size_t)((len + 1) * sizeof(double));
        double *v = (double *)malloc(nbytes);

        if (v == NULL) {
            fprintf(stderr,"allocazione non riuscita in vect()");
            exit(-1);
        }
        totall += (double)((len + 1) * sizeof(double)) / 1000.;
        return v;
    }
    Esso prende in input un file generico e crea una matrice termine-documento che indica le occorrenze del termine nel documento.

    Dovrei fargli caricare lista.txt e vorrei che il contenuto venisse memorizzato all'interno di una variabile.
    Come potrei fare?
    Dovrei inserire questo codice dove ho inserito ------------------------------------

  2. #2
    il file lista.txt ha una lista di stringhe...ecco il contenuto :
    lista.txt:
    able
    about
    above
    abst
    accordance
    according
    accordingly
    across
    act
    actually
    added
    adj
    adopted
    affected
    affecting
    affects
    after
    afterwards
    again
    against
    ah
    all
    almost
    alone
    along
    already
    also
    although
    always
    am
    among
    amongst
    an
    and
    announce
    another
    any
    anybody
    anyhow
    anymore
    anyone
    anything
    anyway
    anyways
    anywhere
    apparently
    approximately
    are
    aren
    arent
    arise
    around
    as
    aside
    ask
    asking
    at
    auth
    available
    away
    awfully
    b
    back
    be
    became
    because
    become
    becomes
    becoming
    been
    before
    beforehand
    begin
    beginning
    beginnings
    begins
    behind
    being
    believe
    below
    beside
    besides
    between
    beyond
    biol
    both
    brief
    briefly
    but
    by
    c
    ca
    came
    can
    cannot
    cant
    cause
    causes
    certain
    certainly
    co
    com
    come
    comes
    contain
    containing
    contains
    could
    couldnt
    d
    date
    did
    didnt
    different
    do
    does
    doesnt
    doing
    done
    dont
    down
    downwards
    due
    during
    e
    each
    ed
    edu
    effect
    eg
    eight
    eighty
    either
    else
    elsewhere
    end
    ending
    enough
    especially
    et
    et-al
    etc
    even
    ever
    every
    everybody
    everyone
    everything
    everywhere
    ex
    except
    f
    far
    few
    ff
    fifth
    first
    five
    fix
    followed
    following
    follows
    for
    former
    formerly
    forth
    found
    four
    from
    further
    furthermore
    g
    gave
    get
    gets
    getting
    give
    given
    gives
    giving
    go
    goes
    gone
    got
    gotten
    h
    had
    happens
    hardly
    has
    hasnt
    have
    havent
    having
    he
    hed
    hence
    her
    here
    hereafter
    hereby
    herein
    heres
    hereupon
    hers
    herself
    hes
    hi
    hid
    him
    himself
    his
    hither
    home
    how
    howbeit
    however
    hundred
    i
    id
    ie
    if
    i'll
    im
    immediate
    immediately
    importance
    important
    in
    inc
    indeed
    index
    information
    instead
    into
    invention
    inward
    is
    isnt
    it
    itd
    it'll
    its
    itself
    i've
    j
    just
    k
    keep
    keeps
    kept
    keys
    kg
    km
    know
    known
    knows
    l
    largely
    last
    lately
    later
    latter
    latterly
    least
    less
    lest
    let
    lets
    like
    liked
    likely
    line
    little
    'll
    look
    looking
    looks
    ltd
    m
    made
    mainly
    make
    makes
    many
    may
    maybe
    me
    mean
    means
    meantime
    meanwhile
    merely
    mg
    might
    million
    miss
    ml
    more
    moreover
    most
    mostly
    mr
    mrs
    much
    mug
    must
    my
    myself
    n
    na
    name
    namely
    nay
    nd
    near
    nearly
    necessarily
    necessary
    need
    needs
    neither
    never
    nevertheless
    new
    next
    nine
    ninety
    no
    nobody
    non
    none
    nonetheless
    noone
    nor
    normally
    nos
    not
    noted
    nothing
    now
    nowhere
    o
    obtain
    obtained
    obviously
    of
    off
    often
    oh
    ok
    okay
    old
    omitted
    on
    once
    one
    ones
    only
    onto
    or
    ord
    other
    others
    otherwise
    ought
    our
    ours
    ourselves
    out
    outside
    over
    overall
    owing
    own
    p
    page
    pages
    part
    particular
    particularly
    past
    per
    perhaps
    placed
    please
    plus
    poorly
    possible
    possibly
    potentially
    pp
    predominantly
    present
    previously
    primarily
    probably
    promptly
    proud
    provides
    put
    q
    que
    quickly
    quite
    qv
    r
    ran
    rather
    rd
    re
    readily
    really
    recent
    recently
    ref
    refs
    regarding
    regardless
    regards
    related
    relatively
    research
    respectively
    resulted
    resulting
    results
    right
    run
    s
    said
    same
    saw
    say
    saying
    says
    sec
    section
    see
    seeing
    seem
    seemed
    seeming
    seems
    seen
    self
    selves
    sent
    seven
    several
    shall
    she
    shed
    she'll
    shes
    should
    shouldnt
    show
    showed
    shown
    showns
    shows
    significant
    significantly
    similar
    similarly
    since
    six
    slightly
    so
    some
    somebody
    somehow
    someone
    somethan
    something
    sometime
    sometimes
    somewhat
    somewhere
    soon
    sorry
    specifically
    specified
    specify
    specifying
    state
    states
    still
    stop
    strongly
    sub
    substantially
    successfully
    such
    sufficiently
    suggest
    sup
    sure
    t
    take
    taken
    taking
    tell
    tends
    th
    than
    thank
    thanks
    thanx
    that
    that'll
    thats
    that've
    the
    their
    theirs
    them
    themselves
    then
    thence
    there
    thereafter
    thereby
    thered
    therefore
    therein
    there'll
    thereof
    therere
    theres
    thereto
    thereupon
    there've
    these
    they
    theyd
    they'll
    theyre
    they've
    think
    this
    those
    thou
    though
    thoughh
    thousand
    throug
    through
    throughout
    thru
    thus
    til
    tip
    to
    together
    too
    took
    toward
    towards
    tried
    tries
    truly
    try
    trying
    ts
    twice
    two
    u
    un
    under
    unfortunately
    unless
    unlike
    unlikely
    until
    unto
    up
    upon
    ups
    us
    use
    used
    useful
    usefully
    usefulness
    uses
    using
    usually
    v
    value
    various
    've
    very
    via
    viz
    vol
    vols
    vs
    w
    want
    wants
    was
    wasnt
    way
    we
    wed
    welcome
    we'll
    went
    were
    werent
    we've
    what
    whatever
    what'll
    whats
    when
    whence
    whenever
    where
    whereafter
    whereas
    whereby
    wherein
    wheres
    whereupon
    wherever
    whether
    which
    while
    whim
    whither
    who
    whod
    whoever
    whole
    who'll
    whom
    whomever
    whos
    whose
    why
    widely
    willing
    wish
    with
    within
    without
    wont
    words
    world
    would
    wouldnt
    www
    x
    y
    yes
    yet
    you
    youd
    you'll
    your
    youre
    yours
    yourself
    yourselves
    you've
    z
    zero

  3. #3
    Moderatore di Programmazione L'avatar di alka
    Registrato dal
    Oct 2001
    residenza
    Reggio Emilia
    Messaggi
    24,463

    Moderazione

    Segnalo da subito la presenza di un Regolamento da leggere per conoscere le norme da seguire nella partecipazione a quest'area del forum, innanzitutto.

    In particolare, la conduzione di una discussione deve essere portata avanti con un certo criterio: non si può postare un intero listato di codice con la speranza che qualcuno lo prenda e riesca a compilarlo (senza contare che alcune sequenze di caratteri sono state convertite in "smile"), e anche il file di esempio riportato poteva essere semplicemente riassunto indicando quali sono le sue caratteristiche, senza riportarlo interamente qui.

    Insomma, ci vuole un po' di buon senso.

    Suggerisco di affrontare meglio il problema innanzitutto isolando i punti in cui manca un'implementazione o dove si ha difficoltà, postando solo le parti di codice interessate e formattandole usando l'apposito tag [CODE]; inoltre, è opportuno anche aggiungere qualche riga per spiegare più dettagliatamente il problema che si riscontra, facilitando anche il compito degli utenti nel darti una mano.

    Apri una nuova discussione seguendo tutte le indicazioni fornite.

    Ciao!
    MARCO BREVEGLIERI
    Software and Web Developer, Teacher and Consultant

    Home | Blog | Delphi Podcast | Twitch | Altro...

Permessi di invio

  • Non puoi inserire discussioni
  • Non puoi inserire repliche
  • Non puoi inserire allegati
  • Non puoi modificare i tuoi messaggi
  •  
Powered by vBulletin® Version 4.2.1
Copyright © 2025 vBulletin Solutions, Inc. All rights reserved.