百度之星 2005年预赛题目四

2012-08-26

百度之星 2005年初赛题目四第四题（共四题 100 分）：低频词过滤（ 40 分）题目描述：请编写程序，从包含大量单

百度之星 2005年初赛题目四

第四题（共四题 100 分）：低频词过滤（ 40 分）

题目描述：请编写程序，从包含大量单词的文本中删除出现次数最少的单词。如果有多
个单词都出现最少的次数，则将这些单词都删除。

输入数据：程序读入已被命名为 corpus.txt 的一个大数据量的文本文件，该文件包含英
文单词和中文单词，词与词之间以一个或多个 whitespace 分隔。（为便于调试，您可下载
测试 corpus.txt 文件，实际运行时我们会使用不同内容的输入文件。）

输出数据：在标准输出上打印删除了 corpus.txt 中出现次数最少的单词之后的文本（
词与词保持原来的顺序，仍以空格分隔）。

评分标准：程序输出结果必须正确，内存使用越少越好，程序的执行时间越快越好。

思路：因为词汇的频率是一个未知数，在统计的过程中无法探知到哪些多哪些少，必须把所有的词汇进行统计才能统计出哪些词汇词频最低。故1、构造Hash表，对出现的词汇进行统计，统计词频。2、然后获取到词频最低的列表。3、对原文进行处理，剔除低频词汇。

GHash.h

#ifndef __GHASH_H_#define __GHASH_H_#include "GList.h"#define HASHSIZE 512typedef struct _Item{   char * key;   char * value;   unsigned int count;   struct Item * next;} Item;void GHashInit();Item *  HashInSert(char * key,char * value);Item *  HashSearch(char * key);int  HashIncrease(char * key);int  HashRemove(char * key);void GetMinList(List* glist);void FreeGHash();void PrintGHash();#endif

GHash.c

#include<stdio.h>#include<stdlib.h>#include<string.h>#include "GHash.h"#include "4lib.h"//#include "GList.h"static struct Item *hashtab[HASHSIZE];static void freeItem(Item * item);static unsigned int _Hash(char *key);static unsigned int _ELFHash(char *str);void GHashInit(){  int i=0;  for(i=0; i<HASHSIZE; i++)  {  hashtab[i]=NULL;     }}Item *  HashInSert(char * key,char * value){  Item * np;  unsigned int hashval;  if((np=HashSearch(key))==NULL)  {         np=(Item *)malloc(sizeof(Item));         if(NULL==np || NULL==(np->key = (char*)strdup(key))       || NULL ==(np->value = (char*)strdup(value)) )         {             return NULL;         }         hashval = _Hash(key);         np->next = (struct Item *)hashtab[hashval];         hashtab[hashval] = (struct Item *)np;     }     else     {   free(np->value);   if((np->value=(char *)strdup(value))== NULL)   {    return NULL;         }  }  return np;}Item *  HashSearch(char * key){    Item  *np;    for(np = (Item *)hashtab[_Hash(key)];        np != NULL;        np = np->next)      if(strcmp(key,np->key) == 0)           return np;    return NULL;}int  HashIncrease(char * key){  Item * np;  unsigned int hashval;  if((np=HashSearch(key))==NULL)  {         np=(Item *)malloc(sizeof(Item));         if(NULL==np || NULL ==(np->key = (char *)strdup(key))    )         {             return -1;         }         hashval = _Hash(key);         np->value=NULL;         np->count=1;         np->next = (struct Item *)hashtab[hashval];         hashtab[hashval] = (struct Item *)np;         return 1;     }     else     {   np->count=np->count+1;   //free(np->value);   //if((np->value=strdup(value))== NULL)   //{ //   return NULL;         //}         return np->count;  }  return np->count;}int  HashRemove(char * key){}void GetMinList(List* glist){  GListInit(glist);  GListEmptyList(glist);  int curmin=100;  int i=0;  for(i=0; i<HASHSIZE; i++)  {     if(hashtab[i]!=NULL)     {    Item * tmp=hashtab[i];    while(tmp!=NULL)    { if(tmp->count==curmin) { GListAddNode(tmp->key,glist);          }        else if(tmp->count<curmin)        {  GListEmptyList(glist);  GListAddNode(tmp->key,glist);  curmin=tmp->count;            }            tmp=tmp->next;        } }     }}void FreeGHash(){  int i=0;  for(i=0; i<HASHSIZE; i++)  {     if(hashtab[i]!=NULL)     {    Item * tmp;    Item * deln;    for(tmp=hashtab[i]; tmp!=NULL; tmp=hashtab[i])    {     hashtab[i]=(struct Item *)(tmp->next);     freeItem(tmp);} }     }}void PrintGHash(){  printf("Print Hash:\n");  int i=0;  for(i=0; i<HASHSIZE; i++)  {     if(hashtab[i] !=NULL )     {    printf("%d---",i);    Item * tmp;    for(tmp=hashtab[i]; tmp!=NULL; tmp=tmp->next)    {     printf("%s:%s:%d;",tmp->key,tmp->value,tmp->count);}printf("\n"); }     }}static unsigned int _Hash(char *key){    return _ELFHash(key)%HASHSIZE;}// ELF Hash Functionstatic unsigned int _ELFHash(char *str){       unsigned int hash = 0;       unsigned int x = 0;       while (*str)      {           hash = (hash << 4) + (*str++);//hash左移4位，当前字符ASCII存入hash           if ((x = hash & 0xF0000000L) != 0)           {//如果最高的四位不为0，则说明字符多余7个，如果不处理，再加第九个字符时，第一个字符会被移出，因此要有如下处理。           //该处理，如果对于字符串(a-z 或者A-Z)就会仅仅影响5-8位，否则会影响5-31位，因为C语言使用的算数移位           hash ^= (x >> 24);           //清空28-31位。           hash &= ~x;           }       }      //返回一个符号位为0的数，即丢弃最高位，以免函数外产生影响。(我们可以考虑，如果只有字符，符号位不可能为负)     return (hash & 0x7FFFFFFF);}static void freeItem(Item * item){   free(item->key);   free(item->value);   free(item);   item=NULL;}

?GLish.h

#ifndef __GLIST_H_#define __GLIST_H_typedef struct _Node {    char* data;    struct _Node * next; } Node;typedef struct _List{   int count;    Node * header;} List;void GListInit(List* glist);void GListAddNode(char* data,List* glist);void GListEmptyList(List * glist);int GListGetCount(List* glist);void GListPrint(List * glist);#endif

???GLish.c

#include <stdio.h>#include <stdlib.h>#include "GList.h"#include "4lib.h"//static List glist ;void GListInit(List* glist){  glist->count=0;  glist->header=NULL;}void GListAddNode(char* data,List* glist){ Node * node = (Node *)malloc(sizeof(Node)); //node->data=data; node->data=(char *)strdup(data); node->next=glist->header; glist->header=node; glist->count=glist->count+1;}int GetCount(List* glist){ return glist->count;}void FreeNode(Node * node){  free(node->data);  free(node);  node=NULL;}void GListEmptyList(List * glist){  Node* tmp;  while(glist->header!=NULL)  {        tmp=glist->header;        glist->header=tmp->next;        //free(tmp);        FreeNode(tmp);     }     glist->count=0;     glist->header=NULL;}void GListPrint(List * glist){  printf("GList:\n");  Node* tmp=glist->header;  while(tmp!=NULL)  {        printf("%s ",tmp->data);        tmp=tmp->next;     }     printf("\n");}int GListSearch(char * data,List* glist){ Node* tmp=glist->header;  while(tmp!=NULL)  {    if(strcmp(data,tmp->data)==0)         return TRUE;    else            tmp=tmp->next;     }     return FALSE;}

? 4lib.h

#ifndef __4LIB_H_#define __4LIB_H#define TRUE 1#define FALSE 0char *strdup(const char *str);#endif

4lib.c

char *strdup(const char *str){    int n = strlen(str) + 1;    char *dup = malloc(n);    if(dup)    {        strcpy(dup, str);    }    return dup;}

?? main.c

#include <stdio.h>#include <stdlib.h>#include <string.h>#include "GList.h"#include "GHash.h"#include "4lib.h"#define INPUT "4.input.txt"#define READBUFSIZE 513#define FREADSIZE ((READBUFSIZE-1))#define KEEPBUFSIZE (READBUFSIZE*2)char keepbuf[KEEPBUFSIZE];extern int  HashIncrease(char * key);void HandlerItem(const char *item){ //printf(" %s\n",item); HashIncrease(item);}void HanlderBuf(const char * buf){  if (buf==NULL)      return;  strncat(keepbuf,buf,strlen(buf));  char *split=" \t\n";  char *tmp,*next;  char *first=strtok(keepbuf,split);  //HandlerItem(first,out);  tmp=first;  while(tmp)  {          next=strtok(NULL,split);          if(next==NULL)//end of string          {          strncpy(keepbuf,tmp,strlen(tmp));          keepbuf[strlen(tmp)]='\0';          break;      }      else      {     HandlerItem(tmp);     }          tmp=next;               }}void HandlerItemout(char *item,List * glist){    if(GListSearch(item,glist)==FALSE)  printf(" %s ",item); //HashIncrease(item);}void HanlderBufout(char * buf,List * glist){  if (buf==NULL)      return;  strncat(keepbuf,buf,strlen(buf));  char *split=" \t\n";  char *tmp,*next;  char *first=strtok(keepbuf,split);  //HandlerItem(first,out);  tmp=first;  while(tmp)  {          next=strtok(NULL,split);          if(next==NULL)//end of string          {          strncpy(keepbuf,tmp,strlen(tmp));          keepbuf[strlen(tmp)]='\0';          break;      }      else      {     HandlerItemout(tmp,glist);     }          tmp=next;               }}int main(int argc, char *argv[]){  FILE * f=fopen(INPUT,"r+");  if(NULL==f)  { perror("Open file:");  }    GHashInit();    char readbuf[READBUFSIZE];  memset(keepbuf,'\0',KEEPBUFSIZE);  memset(readbuf,'\0',READBUFSIZE);  int readnum;  while((readnum=fread(readbuf,sizeof(char),FREADSIZE,f))>0)  {      readbuf[readnum]='\0';      //printf("%d\n",readnum);      if(readnum<FREADSIZE)      {      if(feof(f)!=0)//end file      {         HanlderBuf(readbuf);         break;      }          else          {     perror("Error!");     }  }  else if(readnum==FREADSIZE)  {      HanlderBuf(readbuf);      }  }    if(strlen(keepbuf)>0)    HandlerItem(keepbuf);  fclose(f);  PrintGHash();  List glist;  GetMinList(&glist);  printf("list count:%d\n",GetCount(&glist));  GListPrint(&glist);    FreeGHash();    //hanlde the input  f=fopen(INPUT,"r+");  if(NULL==f)  { perror("Open file:");  }  memset(keepbuf,'\0',KEEPBUFSIZE);  memset(readbuf,'\0',READBUFSIZE);  readnum;  while((readnum=fread(readbuf,sizeof(char),FREADSIZE,f))>0)  {      readbuf[readnum]='\0';      //printf("%d\n",readnum);      if(readnum<FREADSIZE)      {      if(feof(f)!=0)//end file      {         HanlderBufout(readbuf,&glist);         break;      }          else          {     perror("Error!");     }  }  else if(readnum==FREADSIZE)  {      HanlderBufout(readbuf,&glist);      }  }    if(strlen(keepbuf)>0)    HandlerItemout(keepbuf,&glist);  fclose(f);    GListEmptyList(&glist);  system("PAUSE");  return 0;}

?? 该解决方案不包含的内容：

???1、在对词汇进行统计的时，未记录词汇的位置，造成在提出低频词汇时需要再次比对，可以提升

?? 2、仅适用于最低频的，无法针对词频范围。如果需要则可以修改为HashTable的冲突列表为顺序列表

?? 3、无法获得每个低频词汇的出现的顺序

?? 4、替换低频词汇效率不高

热点排行

编程

百度之星 2005年 预赛题目四

百度之星 2005年预赛题目四