首页 \ 问答 \ 有没有一个真正有效的例子可以展示x86_64上ILP(指令级并行性)的好处?(Is there a really working example which showing the benefits of ILP(Instruction-Level Parallelism) on x86_64?)

有没有一个真正有效的例子可以展示x86_64上ILP(指令级并行性)的好处?(Is there a really working example which showing the benefits of ILP(Instruction-Level Parallelism) on x86_64?)

由于已知的CPU是流水线,并且如果命令的顺序彼此独立 - 这被称为ILP(指令级并行性): http://en.wikipedia.org/wiki/Instruction-level_parallelism

但是有没有一个真正有效的例子,它显示了CPU x86_64(但是在这两种情况下的相同数量的cmp / jne )ILP的好处,至少是synthetic(合成)的例子?

我将编写以下示例 - 将数组的所有元素相加,但不会显示ILP的任何优势: http://ideone.com/fork/poWfsm

  • 顺序:
        for(i = 0; i < arr_size; i += 8) {
            result += arr[i+0] + arr[i+1] + 
                    arr[i+2] + arr[i+3] + 
                    arr[i+4] + arr[i+5] +
                    arr[i+6] + arr[i+7];
        }
  • ILP:
        register unsigned int v0, v1, v2, v3;
        v0 = v1 = v2 = v3 = 0;
        for(i = 0; i < arr_size; i += 8) {              
            v0 += arr[i+0] + arr[i+1];
            v1 += arr[i+2] + arr[i+3];
            v2 += arr[i+4] + arr[i+5];
            v3 += arr[i+6] + arr[i+7];
        }
        result = v0+v1+v2+v3;

结果:

seq:0.100000秒,res:1000000000,ipl:0.110000秒,更快0.909091 ×,res:1000000000

seq:0.100000秒,res:1000000000,ipl:0.100000sec,更快1.000000X ,res:1000000000

seq:0.100000秒,res:1000000000,ipl:0.110000秒,更快0.909091 ×,res:1000000000

seq:0.100000秒,res:1000000000,ipl:0.100000sec,更快1.000000X ,res:1000000000

seq:0.110000秒,res:1000000000,ipl:0.110000秒,更快1.000000 X,res:1000000000

seq:0.100000秒,res:1000000000,ipl:0.110000秒,更快0.909091 ×,res:1000000000

seq:0.100000秒,res:1000000000,ipl:0.110000秒,更快0.909091 ×,res:1000000000

seq:0.110000秒,res:1000000000,ipl:0.100000秒,更快1.100000 X,res:1000000000

seq:0.110000秒,res:1000000000,ipl:0.100000秒,更快1.100000 X,res:1000000000

seq:0.110000秒,res:1000000000,ipl:0.120000秒,更快0.916667 ×,res:1000000000

更快的AVG: 0.975303

ILP甚至比Sequential慢一点。

C代码: http://ideone.com/fork/poWfsm

#include <time.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Benchmark: compare a straight-line ("sequential") array reduction against
 * a 4-accumulator ("ILP") reduction over the same data, timing each with
 * clock() and reporting the speed ratio per iteration and on average.
 * Returns 0 on success, 1 if the array allocation fails.
 */
int main() {
    // create and init array
    const size_t arr_size = 100000000;
    // No cast on malloc in C; sizeof *arr stays correct if the type changes.
    unsigned int *arr = malloc(arr_size * sizeof *arr);
    if (arr == NULL) {
        // ~400 MB request - allocation can realistically fail.
        fprintf(stderr, "malloc failed\n");
        return 1;
    }
    size_t i, k;
    for(i = 0; i < arr_size; ++i)
        arr[i] = 10;

    unsigned int result = 0;
    clock_t start, end;
    const size_t c_iterations = 10; // iterations of experiment (size_t: avoids signed/unsigned compare with k)
    float faster_avg = 0;
    // -----------------------------------------------------------------


    for(k = 0; k < c_iterations; ++k) {
        result = 0; 

        // Sequential: one chain of additions into a single accumulator.
        start = clock();

        for(i = 0; i < arr_size; i += 8) {
            result += arr[i+0] + arr[i+1] + 
                    arr[i+2] + arr[i+3] + 
                    arr[i+4] + arr[i+5] +
                    arr[i+6] + arr[i+7];
        }

        end = clock();
        const float c_time_seq = (float)(end - start)/CLOCKS_PER_SEC;   
        printf("seq: %f sec, res: %u, ", c_time_seq, result);
        // -----------------------------------------------------------------

        result = 0;

        // IPL-optimization: four independent accumulators so the adds can
        // execute in parallel on a superscalar core.
        start = clock();

        register unsigned int v0, v1, v2, v3;
        v0 = v1 = v2 = v3 = 0;

        for(i = 0; i < arr_size; i += 8) {

            v0 += arr[i+0] + arr[i+1];
            v1 += arr[i+2] + arr[i+3];
            v2 += arr[i+4] + arr[i+5];
            v3 += arr[i+6] + arr[i+7];


        }
        result = v0+v1+v2+v3;


        end = clock();
        const float c_time_ipl = (float)(end - start)/CLOCKS_PER_SEC;
        const float c_faster = c_time_seq/c_time_ipl;

        printf("ipl: %f sec, faster %f X, res: %u \n", c_time_ipl, c_faster, result);           
        faster_avg += c_faster;
    }

    faster_avg = faster_avg/c_iterations;
    printf("faster AVG: %f \n", faster_avg);

    free(arr);
    return 0;
}

更新:

  • Sequential(反汇编程序MS Visual Studio 2013)
    for (i = 0; i < arr_size; i += 8) {
        result += arr[i + 0] + arr[i + 1] +
            arr[i + 2] + arr[i + 3] +
            arr[i + 4] + arr[i + 5] +
            arr[i + 6] + arr[i + 7];
    }

000000013F131080  mov         ecx,dword ptr [rdx-18h]  
000000013F131083  lea         rdx,[rdx+20h]  
000000013F131087  add         ecx,dword ptr [rdx-34h]  
000000013F13108A  add         ecx,dword ptr [rdx-30h]  
000000013F13108D  add         ecx,dword ptr [rdx-2Ch]  
000000013F131090  add         ecx,dword ptr [rdx-28h]  
000000013F131093  add         ecx,dword ptr [rdx-24h]  
000000013F131096  add         ecx,dword ptr [rdx-1Ch]  
000000013F131099  add         ecx,dword ptr [rdx-20h]  
000000013F13109C  add         edi,ecx  
000000013F13109E  dec         r8  
000000013F1310A1  jne         main+80h (013F131080h)  
  • ILP(反汇编MS Visual Studio 2013)
    for (i = 0; i < arr_size; i += 8) {
        v0 += arr[i + 0] + arr[i + 1];
000000013F1310F0  mov         ecx,dword ptr [rdx-0Ch]  
        v1 += arr[i + 2] + arr[i + 3];
        v2 += arr[i + 4] + arr[i + 5];
000000013F1310F3  mov         eax,dword ptr [rdx+8]  
000000013F1310F6  lea         rdx,[rdx+20h]  
000000013F1310FA  add         ecx,dword ptr [rdx-28h]  
000000013F1310FD  add         eax,dword ptr [rdx-1Ch]  
000000013F131100  add         ebp,ecx  
000000013F131102  mov         ecx,dword ptr [rdx-24h]  
000000013F131105  add         ebx,eax  
000000013F131107  add         ecx,dword ptr [rdx-20h]  
        v3 += arr[i + 6] + arr[i + 7];
000000013F13110A  mov         eax,dword ptr [rdx-10h]  
        v3 += arr[i + 6] + arr[i + 7];
000000013F13110D  add         eax,dword ptr [rdx-14h]  
000000013F131110  add         esi,ecx  
000000013F131112  add         edi,eax  
000000013F131114  dec         r8  
000000013F131117  jne         main+0F0h (013F1310F0h) 
    }
    result = v0 + v1 + v2 + v3;

编译器命令行:

/GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /Ob2 /sdl /Fd"x64\Release\vc120.pdb" /fp:precise /D "_MBCS" /errorReport:prompt /WX- /Zc:forScope /Gd /Oi /MT /Fa"x64\Release\" /EHsc /nologo /Fo"x64\Release\" /Ot /Fp"x64\Release\IPL_reduce_test.pch" 

答案的附加注释:

这个简单的例子展示了在50000000双元素的数组中,展开循环和展开循环+ ILP之间ILP的好处: http://ideone.com/LgTP6b

更快的AVG:1.152778

  • False-Sequential可以通过CPU管道进行优化(反汇编器MS Visual Studio 2013) - 在每次迭代中添加8个元素使用临时寄存器xmm0 ,然后添加到结果xmm6 ,即可以使用寄存器重命名
result += arr[i + 0] + arr[i + 1] + arr[i + 2] + arr[i + 3] +
    arr[i + 4] + arr[i + 5] + arr[i + 6] + arr[i + 7];
000000013FBA1090  movsd       xmm0,mmword ptr [rcx-10h]  
000000013FBA1095  add         rcx,40h  
000000013FBA1099  addsd       xmm0,mmword ptr [rcx-48h]  
000000013FBA109E  addsd       xmm0,mmword ptr [rcx-40h]  
000000013FBA10A3  addsd       xmm0,mmword ptr [rcx-38h]  
000000013FBA10A8  addsd       xmm0,mmword ptr [rcx-30h]  
000000013FBA10AD  addsd       xmm0,mmword ptr [rcx-28h]  
000000013FBA10B2  addsd       xmm0,mmword ptr [rcx-20h]  
000000013FBA10B7  addsd       xmm0,mmword ptr [rcx-18h]  
000000013FBA10BC  addsd       xmm6,xmm0  
000000013FBA10C0  dec         rdx  
000000013FBA10C3  jne         main+90h (013FBA1090h) 
  • 无法通过CPU管道优化的True-Sequential (反汇编器MS Visual Studio 2013) - 在每次迭代中添加8个元素使用结果寄存器xmm6 ,即不能使用寄存器重命名
            result += arr[i + 0];
000000013FFC1090  addsd       xmm6,mmword ptr [rcx-10h]  
000000013FFC1095  add         rcx,40h  
            result += arr[i + 1];
000000013FFC1099  addsd       xmm6,mmword ptr [rcx-48h]  
            result += arr[i + 2];
000000013FFC109E  addsd       xmm6,mmword ptr [rcx-40h]  
            result += arr[i + 3];
000000013FFC10A3  addsd       xmm6,mmword ptr [rcx-38h]  
            result += arr[i + 4];
000000013FFC10A8  addsd       xmm6,mmword ptr [rcx-30h]  
            result += arr[i + 5];
000000013FFC10AD  addsd       xmm6,mmword ptr [rcx-28h]  
            result += arr[i + 6];
000000013FFC10B2  addsd       xmm6,mmword ptr [rcx-20h]  
            result += arr[i + 7];
000000013FFC10B7  addsd       xmm6,mmword ptr [rcx-18h]  
000000013FFC10BC  dec         rdx  
000000013FFC10BF  jne         main+90h (013FFC1090h) 

As is known, the CPU is pipelined, and it works most efficiently if the commands in a sequence are independent from each other - this is known as ILP (Instruction-Level Parallelism): http://en.wikipedia.org/wiki/Instruction-level_parallelism

But is there a really working example which shows the benefits of ILP, at least a synthetic example, for an x86_64 CPU (with the same number of cmp/jne instructions in both cases)?

I will write the following example - add up all the elements of the array, but it does not show any advantages of ILP: http://ideone.com/fork/poWfsm

  • Sequential:
        for(i = 0; i < arr_size; i += 8) {
            result += arr[i+0] + arr[i+1] + 
                    arr[i+2] + arr[i+3] + 
                    arr[i+4] + arr[i+5] +
                    arr[i+6] + arr[i+7];
        }
  • ILP:
        register unsigned int v0, v1, v2, v3;
        v0 = v1 = v2 = v3 = 0;
        for(i = 0; i < arr_size; i += 8) {              
            v0 += arr[i+0] + arr[i+1];
            v1 += arr[i+2] + arr[i+3];
            v2 += arr[i+4] + arr[i+5];
            v3 += arr[i+6] + arr[i+7];
        }
        result = v0+v1+v2+v3;

Result:

seq: 0.100000 sec, res: 1000000000, ipl: 0.110000 sec, faster 0.909091 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.100000 sec, faster 1.000000 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.110000 sec, faster 0.909091 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.100000 sec, faster 1.000000 X, res: 1000000000

seq: 0.110000 sec, res: 1000000000, ipl: 0.110000 sec, faster 1.000000 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.110000 sec, faster 0.909091 X, res: 1000000000

seq: 0.100000 sec, res: 1000000000, ipl: 0.110000 sec, faster 0.909091 X, res: 1000000000

seq: 0.110000 sec, res: 1000000000, ipl: 0.100000 sec, faster 1.100000 X, res: 1000000000

seq: 0.110000 sec, res: 1000000000, ipl: 0.100000 sec, faster 1.100000 X, res: 1000000000

seq: 0.110000 sec, res: 1000000000, ipl: 0.120000 sec, faster 0.916667 X, res: 1000000000

faster AVG: 0.975303

ILP even a little slower than Sequential.

C-code: http://ideone.com/fork/poWfsm

#include <time.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Benchmark from the question: times a single-accumulator ("sequential")
 * reduction of a 100,000,000-element array against a 4-accumulator ("ILP")
 * reduction, printing per-iteration times and the average speed ratio.
 */
int main() {
    // create and init array
    const size_t arr_size = 100000000;
    // NOTE(review): malloc result is not checked; a ~400 MB allocation
    // failure would make the init loop dereference NULL.
    unsigned int *arr = (unsigned int*) malloc(arr_size * sizeof(unsigned int));
    size_t i, k;
    for(i = 0; i < arr_size; ++i)
        arr[i] = 10;

    unsigned int result = 0;
    clock_t start, end;
    const int c_iterations = 10;    // iterations of experiment
    float faster_avg = 0;           // running sum of per-iteration seq/ipl ratios
    // -----------------------------------------------------------------


    for(k = 0; k < c_iterations; ++k) {
        result = 0; 

        // Sequential: every add feeds one accumulator (result),
        // forming a single dependency chain.
        start = clock();

        for(i = 0; i < arr_size; i += 8) {
            result += arr[i+0] + arr[i+1] + 
                    arr[i+2] + arr[i+3] + 
                    arr[i+4] + arr[i+5] +
                    arr[i+6] + arr[i+7];
        }

        end = clock();
        const float c_time_seq = (float)(end - start)/CLOCKS_PER_SEC;   
        printf("seq: %f sec, res: %u, ", c_time_seq, result);
        // -----------------------------------------------------------------

        result = 0;

        // IPL-optimization: four independent accumulators (v0..v3) so the
        // additions form four shorter dependency chains.
        start = clock();

        register unsigned int v0, v1, v2, v3;
        v0 = v1 = v2 = v3 = 0;

        for(i = 0; i < arr_size; i += 8) {

            v0 += arr[i+0] + arr[i+1];
            v1 += arr[i+2] + arr[i+3];
            v2 += arr[i+4] + arr[i+5];
            v3 += arr[i+6] + arr[i+7];


        }
        result = v0+v1+v2+v3;


        end = clock();
        const float c_time_ipl = (float)(end - start)/CLOCKS_PER_SEC;
        const float c_faster = c_time_seq/c_time_ipl;   // >1 means ILP version was faster

        printf("ipl: %f sec, faster %f X, res: %u \n", c_time_ipl, c_faster, result);           
        faster_avg += c_faster;
    }

    faster_avg = faster_avg/c_iterations;
    printf("faster AVG: %f \n", faster_avg);

    // NOTE(review): arr is never freed; harmless here since the process exits.
    return 0;
}

UPDATE:

  • Sequential (Disassembler MS Visual Studio 2013):
    for (i = 0; i < arr_size; i += 8) {
        result += arr[i + 0] + arr[i + 1] +
            arr[i + 2] + arr[i + 3] +
            arr[i + 4] + arr[i + 5] +
            arr[i + 6] + arr[i + 7];
    }

000000013F131080  mov         ecx,dword ptr [rdx-18h]  
000000013F131083  lea         rdx,[rdx+20h]  
000000013F131087  add         ecx,dword ptr [rdx-34h]  
000000013F13108A  add         ecx,dword ptr [rdx-30h]  
000000013F13108D  add         ecx,dword ptr [rdx-2Ch]  
000000013F131090  add         ecx,dword ptr [rdx-28h]  
000000013F131093  add         ecx,dword ptr [rdx-24h]  
000000013F131096  add         ecx,dword ptr [rdx-1Ch]  
000000013F131099  add         ecx,dword ptr [rdx-20h]  
000000013F13109C  add         edi,ecx  
000000013F13109E  dec         r8  
000000013F1310A1  jne         main+80h (013F131080h)  
  • ILP (Disassembler MS Visual Studio 2013):
    for (i = 0; i < arr_size; i += 8) {
        v0 += arr[i + 0] + arr[i + 1];
000000013F1310F0  mov         ecx,dword ptr [rdx-0Ch]  
        v1 += arr[i + 2] + arr[i + 3];
        v2 += arr[i + 4] + arr[i + 5];
000000013F1310F3  mov         eax,dword ptr [rdx+8]  
000000013F1310F6  lea         rdx,[rdx+20h]  
000000013F1310FA  add         ecx,dword ptr [rdx-28h]  
000000013F1310FD  add         eax,dword ptr [rdx-1Ch]  
000000013F131100  add         ebp,ecx  
000000013F131102  mov         ecx,dword ptr [rdx-24h]  
000000013F131105  add         ebx,eax  
000000013F131107  add         ecx,dword ptr [rdx-20h]  
        v3 += arr[i + 6] + arr[i + 7];
000000013F13110A  mov         eax,dword ptr [rdx-10h]  
        v3 += arr[i + 6] + arr[i + 7];
000000013F13110D  add         eax,dword ptr [rdx-14h]  
000000013F131110  add         esi,ecx  
000000013F131112  add         edi,eax  
000000013F131114  dec         r8  
000000013F131117  jne         main+0F0h (013F1310F0h) 
    }
    result = v0 + v1 + v2 + v3;

Compiler command line:

/GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /Ob2 /sdl /Fd"x64\Release\vc120.pdb" /fp:precise /D "_MBCS" /errorReport:prompt /WX- /Zc:forScope /Gd /Oi /MT /Fa"x64\Release\" /EHsc /nologo /Fo"x64\Release\" /Ot /Fp"x64\Release\IPL_reduce_test.pch" 

Additional Notes to the answer:

The simple example which showing the benefits of ILP between Unroll-loop and Unroll-loop+ILP for array of 50000000 double elements: http://ideone.com/LgTP6b

faster AVG: 1.152778

  • False-Sequential which can be optimized by CPU-pipeline (Disassembler MS Visual Studio 2013) - for add 8 elements in each iteration uses temporary register xmm0 which then adds to the result xmm6, i.e. can be used Register renaming:
result += arr[i + 0] + arr[i + 1] + arr[i + 2] + arr[i + 3] +
    arr[i + 4] + arr[i + 5] + arr[i + 6] + arr[i + 7];
000000013FBA1090  movsd       xmm0,mmword ptr [rcx-10h]  
000000013FBA1095  add         rcx,40h  
000000013FBA1099  addsd       xmm0,mmword ptr [rcx-48h]  
000000013FBA109E  addsd       xmm0,mmword ptr [rcx-40h]  
000000013FBA10A3  addsd       xmm0,mmword ptr [rcx-38h]  
000000013FBA10A8  addsd       xmm0,mmword ptr [rcx-30h]  
000000013FBA10AD  addsd       xmm0,mmword ptr [rcx-28h]  
000000013FBA10B2  addsd       xmm0,mmword ptr [rcx-20h]  
000000013FBA10B7  addsd       xmm0,mmword ptr [rcx-18h]  
000000013FBA10BC  addsd       xmm6,xmm0  
000000013FBA10C0  dec         rdx  
000000013FBA10C3  jne         main+90h (013FBA1090h) 
  • True-Sequential which can not be optimized by CPU-pipeline (Disassembler MS Visual Studio 2013) - for add 8 elements in each iteration uses the result register xmm6, i.e. can not be used Register renaming:
            result += arr[i + 0];
000000013FFC1090  addsd       xmm6,mmword ptr [rcx-10h]  
000000013FFC1095  add         rcx,40h  
            result += arr[i + 1];
000000013FFC1099  addsd       xmm6,mmword ptr [rcx-48h]  
            result += arr[i + 2];
000000013FFC109E  addsd       xmm6,mmword ptr [rcx-40h]  
            result += arr[i + 3];
000000013FFC10A3  addsd       xmm6,mmword ptr [rcx-38h]  
            result += arr[i + 4];
000000013FFC10A8  addsd       xmm6,mmword ptr [rcx-30h]  
            result += arr[i + 5];
000000013FFC10AD  addsd       xmm6,mmword ptr [rcx-28h]  
            result += arr[i + 6];
000000013FFC10B2  addsd       xmm6,mmword ptr [rcx-20h]  
            result += arr[i + 7];
000000013FFC10B7  addsd       xmm6,mmword ptr [rcx-18h]  
000000013FFC10BC  dec         rdx  
000000013FFC10BF  jne         main+90h (013FFC1090h) 

原文:https://stackoverflow.com/questions/27748020
更新时间:2023-05-25 18:05

最满意答案

浏览器在单独的请求中加载iframe内容。 你必须这样做:

for iframe in iframexx:
    response = urllib2.urlopen(iframe.attrs['src'])
    iframe_soup = BeautifulSoup(response)

记住:BeautifulSoup不是浏览器; 它不会为你获取图像,CSS和JavaScript资源。


Browsers load the iframe content in a separate request. You'll have to do the same:

for iframe in iframexx:
    response = urllib2.urlopen(iframe.attrs['src'])
    iframe_soup = BeautifulSoup(response)

Remember: BeautifulSoup is not a browser; it won't fetch images, CSS and JavaScript resources for you either.

相关问答

更多

最新问答

更多
  • 获取MVC 4使用的DisplayMode后缀(Get the DisplayMode Suffix being used by MVC 4)
  • 如何通过引用返回对象?(How is returning an object by reference possible?)
  • 矩阵如何存储在内存中?(How are matrices stored in memory?)
  • 每个请求的Java新会话?(Java New Session For Each Request?)
  • css:浮动div中重叠的标题h1(css: overlapping headlines h1 in floated divs)
  • 无论图像如何,Caffe预测同一类(Caffe predicts same class regardless of image)
  • xcode语法颜色编码解释?(xcode syntax color coding explained?)
  • 在Access 2010 Runtime中使用Office 2000校对工具(Use Office 2000 proofing tools in Access 2010 Runtime)
  • 从单独的Web主机将图像传输到服务器上(Getting images onto server from separate web host)
  • 从旧版本复制文件并保留它们(旧/新版本)(Copy a file from old revision and keep both of them (old / new revision))
  • 西安哪有PLC可控制编程的培训
  • 在Entity Framework中选择基类(Select base class in Entity Framework)
  • 在Android中出现错误“数据集和渲染器应该不为null,并且应该具有相同数量的系列”(Error “Dataset and renderer should be not null and should have the same number of series” in Android)
  • 电脑二级VF有什么用
  • Datamapper Ruby如何添加Hook方法(Datamapper Ruby How to add Hook Method)
  • 金华英语角.
  • 手机软件如何制作
  • 用于Android webview中图像保存的上下文菜单(Context Menu for Image Saving in an Android webview)
  • 注意:未定义的偏移量:PHP(Notice: Undefined offset: PHP)
  • 如何读R中的大数据集[复制](How to read large dataset in R [duplicate])
  • Unity 5 Heighmap与地形宽度/地形长度的分辨率关系?(Unity 5 Heighmap Resolution relationship to terrain width / terrain length?)
  • 如何通知PipedOutputStream线程写入最后一个字节的PipedInputStream线程?(How to notify PipedInputStream thread that PipedOutputStream thread has written last byte?)
  • python的访问器方法有哪些
  • DeviceNetworkInformation:哪个是哪个?(DeviceNetworkInformation: Which is which?)
  • 在Ruby中对组合进行排序(Sorting a combination in Ruby)
  • 网站开发的流程?
  • 使用Zend Framework 2中的JOIN sql检索数据(Retrieve data using JOIN sql in Zend Framework 2)
  • 条带格式类型格式模式编号无法正常工作(Stripes format type format pattern number not working properly)
  • 透明度错误IE11(Transparency bug IE11)
  • linux的基本操作命令。。。