对于硬盘坏道,确实没有什么特别好的办法。ATA协议中的SMART命令本意是好的,但很多硬盘厂商并没有很好地实现SMART功能;硬盘的相关错误信息会保存在相应的扇区里,但这些扇区相当于一个环形缓冲区,能记录的错误信息有限,如果需要完整的硬盘坏道信息,SMART就无能为力了。完整地读一遍硬盘可以找出全部坏道,但读操作牵涉到DMA数据传输过程,所以效率较低;ATA协议里的READ VERIFY (EXT)命令属于NO DATA类型(不传输数据),可以大大提高效率。以下是具体的代码:
/*
 * Scan a whole disk for unreadable sectors using the ATA
 * READ VERIFY SECTOR(S) EXT command, sent through the Linux SG_IO
 * ATA PASS-THROUGH (16) interface.  The command is of the "no data" kind, so
 * no sector contents are transferred to the host.
 *
 * Usage: ./exec devname size
 *   devname  block device node, e.g. /dev/sda
 *   size     number of sectors verified per command (1..65536)
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <linux/hdreg.h>
#include <linux/types.h>
#include <sys/stat.h>
#include <scsi/sg.h>
#include <scsi/scsi.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/time.h>

#define ATA_DRQ (1 << 3) /* data request i/o */
#define ATA_ERR (1 << 0) /* have an error */
#define DRIVER_SENSE 0x08
#define CONDITION_GOOD 0x02
#define ATA_PASS_THRU_12 12
#define ATA_12 0xa1
#define ATA_PASS_THRU_16 16
#define ATA_16 0x85
#define CMD_NO_DATA (3 << 1)
#define FOR_SENSE_DATA 0x20
#define CMD_VERIFY_EXT 0x42
#define CMD_VERIFY 0x40
#define CMD_VERIFT CMD_VERIFY /* historical misspelling, kept as an alias */

/* Largest sector count one READ VERIFY EXT command can express (count 0). */
#define VERIFY_EXT_MAX_SECTORS 65536u
/* Largest sector count one 28-bit READ VERIFY command can express (count 0). */
#define VERIFY_MAX_SECTORS 256u
/* Number of addressable sectors with 28-bit LBA. */
#define LBA28_LIMIT (1ULL << 28)

typedef unsigned long long u64;
typedef unsigned int u32;
typedef unsigned char u8;

/* ATA register image; the hob_* members hold the "previous" (high-order)
 * bytes used by 48-bit commands. */
struct ata_taskfile {
    u8 hob_feature;
    u8 hob_nsect;
    u8 hob_lbal;
    u8 hob_lbam;
    u8 hob_lbah;
    u8 feature;
    u8 nsect;
    u8 lbal;
    u8 lbam;
    u8 lbah;
    u8 device;
    u8 command;
};

/*
 * Read the size of a block device from /sys/block/<dev>/size.
 *
 * name: device path or bare device name; only the part after the last '/'
 *       is used, so "/dev/sda" and "sda" are equivalent.
 * Returns the value found in sysfs, or 0 when the file cannot be opened or
 * parsed (a diagnostic is printed in that case).
 */
static u64 get_disk_size(const char *name)
{
    const char *base = strrchr(name, '/');
    char path[128];
    u64 sectors = 0;
    FILE *fp;
    int n;

    base = (NULL != base) ? base + 1 : name;

    n = snprintf(path, sizeof(path), "/sys/block/%s/size", base);
    if (n < 0 || (size_t)n >= sizeof(path)) {
        fprintf(stderr, "device name too long: %s\n", name);
        return 0;
    }

    fp = fopen(path, "r");
    if (NULL == fp) {
        perror("fopen");
        return 0;
    }

    if (1 != fscanf(fp, "%llu", &sectors)) {
        fprintf(stderr, "%s: cannot parse sector count\n", path);
        sectors = 0;
    }
    fclose(fp);

    return sectors;
}

/*
 * Fill *tf with a READ VERIFY SECTOR(S) EXT request.
 *
 * lba:   first sector; bits 0..47 are spread over the six LBA bytes.
 * nsect: sector count; only the low 16 bits are stored, so 65536 is encoded
 *        as 0.
 */
static void init_taskfile(struct ata_taskfile *tf, u64 lba, u32 nsect)
{
    memset(tf, 0, sizeof(*tf));

    tf->command = CMD_VERIFY_EXT;
    tf->device = 1 << 6; /* LBA addressing mode */

    tf->nsect = (u8)nsect;
    tf->hob_nsect = (u8)(nsect >> 8);

    tf->lbal = (u8)lba;
    tf->lbam = (u8)(lba >> 8);
    tf->lbah = (u8)(lba >> 16);
    tf->hob_lbal = (u8)(lba >> 24);
    tf->hob_lbam = (u8)(lba >> 32);
    tf->hob_lbah = (u8)(lba >> 40);
}

/*
 * Send the taskfile to the device as an ATA PASS-THROUGH (16) command via
 * SG_IO and inspect the returned ATA status.
 *
 * Returns 0 when the device reports neither ERR nor DRQ, otherwise -1 with
 * errno set (by ioctl, or EBADE for an unusable reply, or EIO for an ATA
 * error).
 */
static int sg_ioctl(int fd, struct ata_taskfile *tf)
{
    u8 sense_buffer[32];
    u8 cdb[ATA_PASS_THRU_16];
    sg_io_hdr_t sg_io;
    int err = 0;

    memset(sense_buffer, 0, sizeof(sense_buffer));
    memset(&sg_io, 0, sizeof(sg_io));
    memset(cdb, 0, sizeof(cdb));

    cdb[0] = ATA_16;
    cdb[1] = CMD_NO_DATA | 1; /* no-data protocol, 48-bit (extend) command */
    cdb[2] = FOR_SENSE_DATA;  /* ask for the ATA registers back as sense data */
    cdb[3] = tf->hob_feature;
    cdb[4] = tf->feature;
    cdb[5] = tf->hob_nsect;
    cdb[6] = tf->nsect;
    cdb[7] = tf->hob_lbal;
    cdb[8] = tf->lbal;
    cdb[9] = tf->hob_lbam;
    cdb[10] = tf->lbam;
    cdb[11] = tf->hob_lbah;
    cdb[12] = tf->lbah;
    cdb[13] = tf->device;
    cdb[14] = tf->command;

    sg_io.interface_id = 'S';
    sg_io.cmd_len = ATA_PASS_THRU_16;
    sg_io.cmdp = cdb;
    sg_io.mx_sb_len = sizeof(sense_buffer);
    sg_io.sbp = sense_buffer;
    sg_io.dxfer_direction = SG_DXFER_NONE;
    sg_io.timeout = 0; /* sg default time 75s */

    if ((err = ioctl(fd, SG_IO, &sg_io)) == -1) {
        perror("SG_IO");
        return err;
    }

    /* Because register return was requested, a sense-carrying reply is the
     * expected outcome; anything else is treated as a bad response. */
    if (sg_io.host_status
        || DRIVER_SENSE != sg_io.driver_status
        || (sg_io.status && CONDITION_GOOD != sg_io.status)) {
        printf("SG_IO: bad response\n");
        errno = EBADE;
        return -1;
    }

    /* Expect descriptor-format sense (0x72) carrying an ATA status return
     * descriptor (code 0x09, length >= 0x0c) starting at byte 8. */
    if (0x72 != sense_buffer[0] || sense_buffer[7] < 14
        || 0x09 != sense_buffer[8] || sense_buffer[9] < 0x0c) {
        printf("SG_IO:bad sense buffer 0x%x 0x%x 0x%x 0x%x\n",
               sense_buffer[0], sense_buffer[7],
               sense_buffer[8], sense_buffer[9]);
        errno = EBADE;
        return -1;
    }

    /* Byte 21 is checked as the ATA status register, byte 11 is reported as
     * the error register. */
    if (sense_buffer[21] & (ATA_DRQ | ATA_ERR)) {
        printf("I/O error, cmd = 0x%02x status = 0x%02x error = 0x%02x\n",
               tf->command, sense_buffer[21], sense_buffer[11]);
        errno = EIO;
        return -1;
    }

    return 0;
}

/*
 * Verify sectors with the 28-bit READ VERIFY command through the legacy
 * HDIO_DRIVE_TASK ioctl.
 *
 * lba:  first sector, must fit in 28 bits together with nsec.
 * nsec: sector count, 1..256 (256 is encoded as 0).
 * Returns 0 on success, -1 with errno set on failure (EINVAL for a range the
 * 28-bit command cannot express).
 */
int ata_ioctl(int fd, u64 lba, u32 nsec)
{
    unsigned char cdb[7];
    int err = 0;

    if (0 == nsec || nsec > VERIFY_MAX_SECTORS
        || lba >= LBA28_LIMIT || nsec > LBA28_LIMIT - lba) {
        errno = EINVAL;
        return -1;
    }

    cdb[0] = CMD_VERIFY;
    cdb[1] = 0x00;
    cdb[2] = (unsigned char)nsec;
    cdb[3] = (lba >> 0) & 0xFF;
    cdb[4] = (lba >> 8) & 0xFF;
    cdb[5] = (lba >> 16) & 0xFF;
    /* Only LBA bits 24..27 belong in the device register; the upper nibble
     * carries mode/device-select bits and must not be overwritten. */
    cdb[6] = 0x40 | ((lba >> 24) & 0x0F);

    if (-1 == (err = ioctl(fd, HDIO_DRIVE_TASK, (void *)cdb))) {
        perror("HDIO_DRIVE_TASK");
        return -1;
    }

    return 0;
}

/*
 * Verify `size` sectors starting at `start` on the open device `fd`.
 *
 * name: device name, used once to look up the disk size for range checking.
 * size: sector count, 1..65536.
 * Returns 0 when the range verified cleanly, -1 with errno set otherwise
 * (EINVAL when the request is empty, too large for one command, or extends
 * past the known end of the disk).
 */
int disk_verify_sectors(const char *name, int fd, u64 start, u32 size)
{
    static u64 capacity = 0;
    struct ata_taskfile tf;

    if (0 == capacity) {
        capacity = get_disk_size(name);
    }

    /* A count of 0 would be interpreted by the device as 65536 sectors and
     * larger counts would be truncated to 16 bits, so refuse both. */
    if (0 == size || size > VERIFY_EXT_MAX_SECTORS) {
        errno = EINVAL;
        return -1;
    }

    /* Skip the range check when the disk size could not be determined. */
    if (0 != capacity && (start >= capacity || size > capacity - start)) {
        errno = EINVAL;
        return -1;
    }

    init_taskfile(&tf, start, size);

    return sg_ioctl(fd, &tf);
}

int main(int argc, char *argv[])
{
    u64 offset = 0;
    u64 capacity;
    struct timeval t1, t2;
    char *end = NULL;
    long requested;
    int size;
    int fd;

    if (argc < 3) {
        printf("Usage: ./exec devname size\n");
        printf("@devname:device name\n"
               "@size:per verifying sectors\n");
        return 0;
    }

    capacity = get_disk_size(argv[1]);
    if (0 == capacity) {
        fprintf(stderr, "%s: cannot determine disk capacity\n", argv[1]);
        return 1;
    }
    printf("disk capacity = %llu\n", capacity);

    errno = 0;
    requested = strtol(argv[2], &end, 10);
    if (end == argv[2] || '\0' != *end || 0 != errno
        || requested < 1 || requested > (long)VERIFY_EXT_MAX_SECTORS) {
        fprintf(stderr, "size must be an integer between 1 and %u\n",
                VERIFY_EXT_MAX_SECTORS);
        return 1;
    }
    size = (int)requested;
    printf("verfy disk granularity %d\n", size);

    if (-1 == (fd = open(argv[1], O_RDWR))) {
        perror("open");
        return 1;
    }

    gettimeofday(&t1, NULL);

    while (offset < capacity) {
        u64 remaining = capacity - offset;
        /* The last chunk is shortened so it ends exactly at the disk end. */
        u32 chunk = (remaining < (u64)size) ? (u32)remaining : (u32)size;

        if (disk_verify_sectors(argv[1], fd, offset, chunk) < 0) {
            printf("%s:sectors between %llu - %llu error\n",
                   argv[1], offset, offset + chunk);
        }
        offset += chunk;
    }

    gettimeofday(&t2, NULL);

    printf("%s:verify over\n", argv[1]);
    printf("kill time = %ld s\n", (long)(t2.tv_sec - t1.tv_sec));

    close(fd);

    return 0;
}
今天在实验室碰到一台机器,根分区和/upgrade分区变成了read-only system.当碰到这个问题的时候,我的第一反应很可能硬件出现了故障,我使用了如下的方法来检测和排除故障:
- 使用dmesg检查是否有关于磁盘错误的信息,果不其然发现大量磁盘扇区错误的信息,如下: end_request: I/O error, dev 03:06 (sda), sector
-
从上面的输出我们判定扇区可能出现了问题,使用badblocks -s -v -o /root/badblocks.log /dev/sda 检查硬盘是否产生坏道并输出到badblocks.log中。log大约是这样的模式:64039224 64039240 64039241 64039242 64039243 64039256 64039257 64039258 64039259 687056
- 可修复坏道的修复方法:1 使用 badblocks -s -w /dev/sda END START,其中END代表需要修复的扇区末端,START代表需要修复的扇区起始端(注意:-w为破坏性写测试,会覆盖该范围内的数据)。2 使用fsck -a /dev/sda1
磁盘坏道分为三种:0磁道坏道、逻辑坏道、硬盘坏道。其中逻辑坏道可以使用上面的方法修复;0磁道坏道的修复方法是隔离0磁道,使用fdisk划分分区的时候从1磁道开始划分。如果是硬盘坏道的话,只能隔离不能修复。硬盘坏道的监测方法:使用上述方法检测修复后,再使用badblocks -s -v -o /root/badblocks.log /dev/sda监
测看是否还有坏道存在,如果坏道还是存在的话说明坏道属于硬盘坏道。硬盘坏道隔离方法,首先记录监测出的硬盘坏道,然后分区的时候把硬盘坏道所在的扇区分
在一个分区(大小一般大于坏扇区大小),划分出的坏道分区不使用即可达到隔离的目的。隔离只是暂时方案,建议尽快更换硬盘,因为坏道会扩散,以免以后出现
严重的数据问题。
测看是否还有坏道存在,如果坏道还是存在的话说明坏道属于硬盘坏道。硬盘坏道隔离方法,首先记录监测出的硬盘坏道,然后分区的时候把硬盘坏道所在的扇区分
在一个分区(大小一般大于坏扇区大小),划分出的坏道分区不使用即可达到隔离的目的。隔离只是暂时方案,建议尽快更换硬盘,因为坏道会扩散,以免以后出现
严重的数据问题。
http://blog.csdn.net/wesleyluo/article/details/6115223
http://blog.chinaunix.net/uid-20786165-id-3170784.html