Linux 磁盘坏道检测和修复查看硬盘坏道代码

对于硬盘坏道，确实没有特别好的处理办法。ATA 协议中的 SMART 命令本意是好的，但很多硬盘厂商并没有很好地实现 SMART 功能；硬盘的相关错误信息会保存在特定的扇区里，但这些扇区相当于一个环形缓冲区，能记录的错误信息有限，如果需要完整的硬盘坏道信息，SMART 就无能为力了。把整块硬盘完整地读一遍可以找出全部坏道，但读操作牵涉到 DMA 数据传输，所以效率较低；ATA 协议里的 READ VERIFY (EXT) 命令属于 NO DATA 类型（不向主机传输数据），可以大大提高检测效率。以下是具体的代码：

    #include <stdio.h>  
    #include <string.h>  
    #include <unistd.h>  
    #include <stdlib.h>  
    #include <linux/hdreg.h>  
    #include <linux/types.h>  
    #include <sys/stat.h>  
    #include <scsi/sg.h>  
    #include <scsi/scsi.h>  
    #include <sys/ioctl.h>  
    #include <linux/fs.h>  
    #include <fcntl.h>  
    #include <errno.h>  
    #include <sys/time.h>  
      
    /* Bits tested in the ATA status byte taken from the returned sense data. */
    #define ATA_DRQ                (1 << 3)    /* data request i/o */  
    #define    ATA_ERR             (1 << 0) /* have an error */  
    /* Value that sg_io.driver_status is compared against after SG_IO. */
    #define DRIVER_SENSE             0x08  
    /*
     * Value that sg_io.status is allowed to have besides 0.
     * NOTE(review): 0x02 is the SCSI CHECK CONDITION status code, so the
     * name looks misleading -- confirm before renaming.
     */
    #define CONDITION_GOOD         0x02  
    /* 12-byte pass-through CDB length and opcode (not used by the code below). */
    #define ATA_PASS_THRU_12    12  
    #define ATA_12                        0xa1  
    /* 16-byte pass-through CDB length and the opcode stored in cdb[0]. */
    #define ATA_PASS_THRU_16    16  
    #define ATA_16                        0x85  
    /* Stored in cdb[1]: selects a command that transfers no data. */
    #define CMD_NO_DATA                (3 << 1)  
    /* Stored in cdb[2]: the sense buffer is parsed for ATA status afterwards. */
    #define FOR_SENSE_DATA        0x20  
    /* ATA command codes: READ VERIFY (EXT) and the command sent by ata_ioctl(). */
    #define CMD_VERIFY_EXT        0x42  
    /* NOTE(review): name is presumably a typo for CMD_VERIFY; kept because ata_ioctl() uses it. */
    #define CMD_VERIFT                0x40  
      
    /* Short aliases for the unsigned integer types used throughout. */
    #define u64        unsigned long long  
    #define u32        unsigned int  
    #define u8        unsigned char  
      
    /*
     * Register image of a single ATA command.
     *
     * init_taskfile() fills it in and sg_ioctl() copies it byte by byte into
     * a 16-byte pass-through CDB.  The hob_* members carry the upper bytes of
     * the request (nsect >> 8 and lba >> 24 / 32 / 40); the un-prefixed
     * members carry the lower bytes.
     */
    struct ata_taskfile {
        /* upper ("hob") bytes */
        u8    hob_feature, hob_nsect, hob_lbal, hob_lbam, hob_lbah;

        /* lower bytes */
        u8    feature, nsect, lbal, lbam, lbah;

        /* device/select byte and command opcode */
        u8    device, command;
    };
      
    /*
     * Read the size of a block device from /sys/block/<dev>/size.
     *
     * @name: device path or bare device name, e.g. "/dev/sda" or "sda".
     *        Everything up to the last '/' is stripped, so device names of
     *        any length work (the previous code assumed exactly three
     *        characters and read out of bounds for shorter strings).
     *
     * Returns the number stored in the sysfs "size" attribute (the kernel
     * documents it in 512-byte sectors), or 0 when the name is empty, the
     * attribute cannot be opened, or its content is not a number.
     */
    static u64 get_disk_size(const char *name)
    {
        const char *base;
        char path[128];
        unsigned long long sectors = 0;
        FILE *fp;
        int len;

        if (NULL == name || '\0' == *name)
            return 0;

        /* keep only the last path component */
        base = strrchr(name, '/');
        base = (NULL != base) ? base + 1 : name;
        if ('\0' == *base)
            return 0;

        len = snprintf(path, sizeof(path), "/sys/block/%s/size", base);
        if (len < 0 || (size_t)len >= sizeof(path)) {
            fprintf(stderr, "get_disk_size: device name too long\n");
            return 0;
        }

        if (NULL == (fp = fopen(path, "r"))) {
            perror("fopen");
            return 0;
        }

        /* a failed conversion must not leak an indeterminate value */
        if (1 != fscanf(fp, "%llu", &sectors))
            sectors = 0;
        fclose(fp);

        return sectors;
    }
      
    /*
     * Prepare *tf for a CMD_VERIFY_EXT command covering nsect sectors
     * starting at lba.
     *
     * The 48 low bits of lba and the 16 low bits of nsect are split into
     * single bytes; anything above that is discarded.  The device byte is
     * set to 0x40 (bit 6).
     */
    static void init_taskfile(struct ata_taskfile *tf,u64 lba,u32 nsect)
    {
        memset(tf, 0, sizeof(struct ata_taskfile));

        /* sector count: low byte, then high byte */
        tf->nsect     = (u8)(nsect & 0xff);
        tf->hob_nsect = (u8)((nsect >> 8) & 0xff);

        /* LBA bits 0..23 */
        tf->lbal = (u8)(lba & 0xff);
        tf->lbam = (u8)((lba >> 8) & 0xff);
        tf->lbah = (u8)((lba >> 16) & 0xff);

        /* LBA bits 24..47 */
        tf->hob_lbal = (u8)((lba >> 24) & 0xff);
        tf->hob_lbam = (u8)((lba >> 32) & 0xff);
        tf->hob_lbah = (u8)((lba >> 40) & 0xff);

        tf->device  = 0x40;
        tf->command = CMD_VERIFY_EXT;
    }
      
      
    /*
     * Send the ATA command described by *tf to the device behind fd using a
     * 16-byte pass-through CDB via the SG_IO ioctl, with no data transfer.
     *
     * The ATA status and error bytes are taken from the descriptor-format
     * sense data that comes back (bytes 21 and 11 of the sense buffer).
     *
     * Returns 0 on success and -1 on failure:
     *   - the ioctl itself failed (errno set by ioctl),
     *   - the SG_IO status fields or the sense buffer do not have the
     *     expected shape (errno = EBADE),
     *   - the ATA status byte has ATA_DRQ or ATA_ERR set (errno = EIO).
     */
    static int sg_ioctl(int fd,struct ata_taskfile *tf)
    {
        u8 sense_buffer[32];
        u8 cdb[ATA_PASS_THRU_16];
        sg_io_hdr_t sg_io;
        int err = 0;

        memset(sense_buffer, 0, sizeof(sense_buffer));
        memset(&sg_io, 0, sizeof(sg_io));
        memset(cdb, 0, sizeof(cdb));

        cdb[0] = ATA_16;
        /*
         * Bit 0 of byte 1 is set together with the hob_* bytes below --
         * presumably the flag that tells the translation layer those upper
         * bytes are valid (48-bit command); confirm against the SAT spec.
         */
        cdb[1] = CMD_NO_DATA | 1;
        cdb[2] = FOR_SENSE_DATA;

        /* odd offsets 3..11: upper bytes, even offsets 4..12: lower bytes */
        cdb[3] = tf->hob_feature;
        cdb[4] = tf->feature;
        cdb[5] = tf->hob_nsect;
        cdb[6] = tf->nsect;
        cdb[7] = tf->hob_lbal;
        cdb[8] = tf->lbal;
        cdb[9] = tf->hob_lbam;
        cdb[10] = tf->lbam;
        cdb[11] = tf->hob_lbah;
        cdb[12] = tf->lbah;
        cdb[13] = tf->device;
        cdb[14] = tf->command;

        /* was written with typographic quotes, which is not valid C */
        sg_io.interface_id = 'S';
        sg_io.cmd_len = sizeof(cdb);
        sg_io.cmdp = cdb;
        sg_io.mx_sb_len = sizeof(sense_buffer);
        sg_io.sbp = sense_buffer;
        sg_io.dxfer_direction = SG_DXFER_NONE;
        sg_io.timeout = 0;    /* 0: leave the timeout at the driver default */

        if ((err = ioctl(fd, SG_IO, &sg_io)) == -1) {
            perror("SG_IO");
            return err;
        }

        /* sense data is requested unconditionally, so it must be present */
        if (sg_io.host_status || DRIVER_SENSE != sg_io.driver_status
         || (sg_io.status && CONDITION_GOOD != sg_io.status)) {
            printf("SG_IO: bad response\n");
            errno = EBADE;
            return -1;
        }

        /*
         * Expect response code 0x72 with at least 14 additional bytes,
         * followed by a descriptor of type 0x09 and length >= 0x0c.
         */
        if (0x72 != sense_buffer[0] || sense_buffer[7] < 14
         || 0x09 != sense_buffer[8] || sense_buffer[9] < 0x0c) {
            printf("SG_IO:bad sense buffer 0x%x 0x%x 0x%x 0x%x\n",
                sense_buffer[0], sense_buffer[7], sense_buffer[8], sense_buffer[9]);
            errno = EBADE;
            return -1;
        }

        if (sense_buffer[21] & (ATA_DRQ | ATA_ERR)) {
            printf("I/O error, cmd = 0x%02x status = 0x%02x error = 0x%02x\n",
                    tf->command, sense_buffer[21], sense_buffer[11]);
            errno = EIO;
            return -1;
        }

        return 0;
    }
      
    /*
     * Verify nsec sectors starting at lba with the 28-bit command
     * CMD_VERIFT, issued through the HDIO_DRIVE_TASK ioctl.
     *
     * The 7-byte argument block is: command, feature, sector count,
     * LBA bits 0-7, 8-15, 16-23, and the device byte whose low nibble holds
     * LBA bits 24-27 (bit 6 = 0x40 is always set).
     *
     * Only requests that fit this encoding are accepted: 1..256 sectors
     * (256 is sent as count byte 0) that end at or below LBA 2^28.
     * Previously larger values were silently truncated, and LBA bits 28-31
     * were OR-ed into the upper nibble of the device byte.
     *
     * Returns 0 on success; -1 with errno = EINVAL for an out-of-range
     * request, or -1 with errno set by ioctl() when the ioctl fails.
     */
    int ata_ioctl(int fd,u64 lba,u32 nsec)
    {
        unsigned char cdb[7];

        if (0 == nsec || nsec > 256
         || lba > 0x0FFFFFFFULL || lba + nsec > 0x10000000ULL) {
            errno = EINVAL;
            return -1;
        }

        cdb[0] = CMD_VERIFT;
        cdb[1] = 0x00;
        cdb[2] = (unsigned char)(nsec & 0xFF);
        cdb[3] = (lba >> 0) & 0xFF;
        cdb[4] = (lba >> 8) & 0xFF;
        cdb[5] = (lba >> 16) & 0xFF;
        /* only the low nibble carries LBA bits; keep the rest of the byte clean */
        cdb[6] = 0x40 | ((lba >> 24) & 0x0F);

        if (-1 == ioctl(fd, HDIO_DRIVE_TASK, (void *)cdb)) {
            perror("HDIO_DRIVE_TASK");
            return -1;
        }

        return 0;
    }
      
    /*
     * Verify `size` sectors of the device starting at sector `start`.
     *
     * @name:  device name, used once to look up the device size
     * @fd:    open descriptor of the same device
     * @start: first sector to verify
     * @size:  number of sectors, 1..65536
     *
     * init_taskfile() keeps only the low 16 bits of the sector count, so a
     * larger count used to be truncated silently and part of the range was
     * reported as verified without being checked.  Such requests, a count
     * of 0, and ranges that run past the known device size are now rejected
     * with errno = EINVAL.  If the device size could not be determined
     * (get_disk_size() returned 0) the range check is skipped and the size
     * is looked up again on the next call.
     *
     * Returns 0 on success, -1 on failure.
     */
    int disk_verify_sectors(const char *name,int fd, u64 start, u32 size)
    {
        static u64 capacity = 0;
        struct ata_taskfile tf;

        if (0 == capacity) {
            capacity = get_disk_size(name);
        }

        if (0 == size || size > 65536) {
            errno = EINVAL;
            return -1;
        }

        /* written without start + size so the check itself cannot overflow */
        if (0 != capacity && (start >= capacity || size > capacity - start)) {
            errno = EINVAL;
            return -1;
        }

        init_taskfile(&tf, start, size);

        return sg_ioctl(fd, &tf);
    }
      
    int main(int argc, char *argv[])  
    {  
        u64 offset = 0;  
        int fd;  
        u64 capacity;  
        struct timeval t1,t2;  
        int size;  
          
        if (argc < 3) {  
            printf("Usage: ./exec devname size/n");  
            printf("@devname:device name/n"  
                         "@size:per verifying sectors/n");  
            return 0;  
        }  
          
        capacity = get_disk_size(argv[1]);  
        printf("disk capacity = %lld/n",capacity);  
        size = atoi(argv[2]);  
        printf("verfy disk granularity %d/n",size);  
          
        if(-1 == (fd = open(argv[1],O_RDWR))){  
            perror("open");  
            return fd;  
        }  
          
        gettimeofday(&t1,NULL);  
        while(1) {  
            if((offset + size) > capacity){  
                size = offset - capacity;  
                capacity = 0;  
            }  
            if (disk_verify_sectors(argv[1],fd,offset, size) < 0) {  
                printf("%s:sectors between %Lu - %Lu error/n",  
      
                       argv[1],offset, offset+size);  
            }  
            offset += size;          
            if(0 == capacity)  
                break;  
        }  
          
        gettimeofday(&t2,NULL);  
        printf("%s:verify over/n",argv[1]);  
        printf("kill time = %ld s/n",(t2.tv_sec - t1.tv_sec));  
        close(fd);  
          
        return 0;  
    }

今天在实验室碰到一台机器，根分区和/upgrade分区变成了read-only system.当碰到这个问题的时候，我的第一反应很可能硬件出现了故障，我使用了如下的方法来检测和排除故障：

使用dmesg检查是否有关于磁盘错误的信息，果不其然发现大量磁盘扇区错误的信息，如下： end_request: I/O error, dev 03:06 (sda), sector
从上面的输出我们判定扇区可能出现了问题，使用 badblocks -s -v -o /root/badblocks.log /dev/sda 检查硬盘是否产生坏道，并把结果输出到 badblocks.log 中。log 大约是这样的模式：64039224 64039240 64039241 64039242 64039243 64039256 64039257 64039258 64039259 687056
坏道修复方法：1 使用 badblocks -s -w /dev/sda END START，其中 END 代表需要修复的扇区末端，START 代表需要修复的扇区起始端（注意：-w 是破坏性的写测试，会覆盖该范围内的数据，操作前请先备份）。2 使用 fsck -a /dev/sda1

磁盘坏道分为三种：0 磁道坏道、逻辑坏道和硬盘（物理）坏道。其中逻辑坏道可以使用上面的方法修复；0 磁道坏道的修复方法是隔离 0 磁道，使用 fdisk 划分分区的时候从 1 磁道开始划分。如果是硬盘坏道的话，只能隔离不能修复。硬盘坏道的监测方法：使用上述方法检测修复后，再使用 badblocks -s -v -o /root/badblocks.log /dev/sda 监测看是否还有坏道存在，如果坏道还是存在的话说明坏道属于硬盘坏道。硬盘坏道隔离方法：首先记录监测出的硬盘坏道，然后分区的时候把硬盘坏道所在的扇区分在一个分区（大小一般大于坏扇区大小），划分出的坏道分区不使用即可达到隔离的目的。隔离只是暂时方案，建议尽快更换硬盘，因为坏道会扩散，以免以后出现严重的数据问题。

http://blog.csdn.net/wesleyluo/article/details/6115223

http://blog.chinaunix.net/uid-20786165-id-3170784.html