由于DXVA2解码得到的数据不宜拷贝回内存交给CPU处理,最好的办法是在GPU上直接进行处理。D3D的像素着色器可以直接对像素进行操作,实现点运算极其简单方便,简单的卷积运算效果也非常好。但D3D9的限制也很多,对于过于复杂的图像处理则显得有些力不从心。
点运算用HLSL非常容易实现,几乎是公式怎么写,代码就怎么写。以RGB转灰度图显示为例:
// Grayscale point-operation pixel shader.
// Samples Tex0, converts the colour to a single gray value and applies one of
// three point transforms selected by iFlag. The result is written to R, G and B.

texture Tex0;

// Transform selector:
//   0 : s = aValue * gray + bValue / 255   (linear)
//   1 : s = aValue * log(1 + gray)         (logarithmic)
//   2 : s = aValue * pow(|gray|, bValue)   (power)
// Any other value leaves s at 0, i.e. the output is black.
int   iFlag  = 0;
float aValue = 0.0;
float bValue = 0.0;

sampler2D YTex = sampler_state
{
    Texture   = <Tex0>;
    MipFilter = LINEAR;
    MinFilter = LINEAR;
    MagFilter = LINEAR;
    AddressU  = CLAMP;
    AddressV  = CLAMP;
};

struct PS_INPUT
{
    float2 uvCoords0 : TEXCOORD0;
};

float4 Main( PS_INPUT input ) : COLOR0
{
    // Fetch the texel once and reuse it for all three channel weights.
    float3 rgb = tex2D( YTex, input.uvCoords0 ).rgb;

    // RGB to gray. (Original author's note: not sure this is exactly how it
    // should be displayed; assumed to be correct for now.)
    float gray = rgb.r * 0.299 + rgb.g * 0.587 + rgb.b * 0.114;

    float s = 0;
    if ( iFlag == 0 )
    {
        // bValue is divided by 255 here — presumably supplied on a 0..255 scale.
        s = aValue * gray + bValue / 255;
    }
    else if ( iFlag == 1 )
    {
        s = aValue * log( 1 + gray );
    }
    else if ( iFlag == 2 )
    {
        // abs() keeps the base of pow() non-negative.
        s = aValue * pow( abs( gray ), bValue );
    }

    // Same value in every colour channel, fully opaque.
    return float4( s, s, s, 1.0 );
}
点运算如此简单是因为GPU是并行运算的,我个人认为可以看作是每个像素点(BGRA)对应一个线程,这大概就是OpenCL中所谓的数据并行。这是一个非常简单的程序,指令数少,程序结构也很简单,shader的版本用2.0就可以轻松编译通过。
指令数较多的情况下,2.0版本的shader就搞不定了,换用3.0版本可以做一些简单的卷积运算。以均值滤波为例(下面的代码对窗口内的像素求和后除以 ksize*ksize,实际是均值滤波而非中值滤波):
// Box (mean) blur effect for Direct3D 9, compiled as vs_3_0 / ps_3_0.
// Pixels inside the rectangle (fLeft, fTop)-(fRight, fBottom), given in texture
// coordinates, are replaced by the average of a ksize x ksize window;
// all other pixels are passed through unchanged.

texture Tex0;

matrix WorldMatrix;
matrix ViewMatrix;
matrix ProjMatrix;

sampler2D YTex = sampler_state
{
    Texture   = <Tex0>;
    MipFilter = LINEAR;
    MinFilter = LINEAR;
    MagFilter = LINEAR;
    AddressU  = CLAMP;
    AddressV  = CLAMP;
};

struct VS_INPUT
{
    float4 pos   : POSITION;
    float4 color : COLOR0;
    float2 tex   : TEXCOORD0;
};

struct VS_OUTPUT
{
    float4 pos   : POSITION;
    float4 color : COLOR0;
    float2 tex   : TEXCOORD0;
};

// Size of the sampled texture in texels; 1 / size is used as the texel step.
float2 g_v4ScreenSize;

// Window edge length. Only odd values from 3 to 11 enable filtering.
int ksize = 1;

// Filtered region in texture coordinates (exclusive bounds). With the defaults
// below no pixel satisfies the region test, so nothing is filtered.
float fLeft   = -1.0f;
float fTop    = -1.0f;
float fRight  = -1.0f;
float fBottom = -1.0f;

//--------------------------------- BurTechnique --------------------------------------

// Transforms the vertex by World * View * Projection and forwards the texture
// coordinate and colour unchanged.
VS_OUTPUT MainVS_Screen( VS_INPUT In )
{
    VS_OUTPUT Out = ( VS_OUTPUT )0;

    float4x4 matWorldView     = mul( WorldMatrix, ViewMatrix );
    float4x4 matWorldViewProj = mul( matWorldView, ProjMatrix );

    Out.pos   = mul( In.pos, matWorldViewProj );
    Out.tex   = In.tex;
    Out.color = In.color;
    return Out;
}

// Adds to 'sum' the taps that grow a (first x first) window into a
// (last+1 x last+1) window, with tap (0,0) located at 'origin'.
// It is always called with literal 'first'/'last' values, so after inlining
// every loop bound is a compile-time constant and the loops are fully unrolled.
// Tap order matches the original hand-written listing: new columns first,
// then the remaining part of the new rows.
void AddRing( inout float3 sum, float2 origin, float2 texel, int first, int last )
{
    // New columns: x in [first, last], every row y in [0, last].
    [unroll] for ( int cx = first; cx <= last; cx++ )
    {
        [unroll] for ( int cy = 0; cy <= last; cy++ )
        {
            sum += tex2D( YTex, origin + texel * float2( cx, cy ) ).rgb;
        }
    }

    // New rows: y in [first, last], only columns x in [0, first) not covered above.
    [unroll] for ( int ry = first; ry <= last; ry++ )
    {
        [unroll] for ( int rx = 0; rx < first; rx++ )
        {
            sum += tex2D( YTex, origin + texel * float2( rx, ry ) ).rgb;
        }
    }
}

// Returns the source colour, or inside the configured region the mean of the
// ksize x ksize neighbourhood with alpha forced to 1.
float4 MainPS_Screen( VS_INPUT In ) : COLOR0
{
    float4 outColor = tex2D( YTex, In.tex ).rgba;

    // Pass through unless ksize is an odd value in [3, 11].
    if ( ksize < 3 || ksize > 11 || ksize % 2 == 0 )
    {
        return outColor;
    }

    // Pass through pixels outside the filtered rectangle.
    if ( !( In.tex.x < fRight && In.tex.y < fBottom && In.tex.x > fLeft && In.tex.y > fTop ) )
    {
        return outColor;
    }

    // Step of one texel in texture-coordinate units.
    float2 texel = float2( 1.0f / g_v4ScreenSize.x, 1.0f / g_v4ScreenSize.y );

    // Top-left tap of the window. ksize is odd here, so (ksize - 1) / 2 is a
    // whole number of texels and the window is centred on In.tex. The previous
    // expression x_off * ksize / 2 was evaluated in floating point (1.5 texels
    // for ksize = 3), which shifted the whole window by half a texel.
    float  halfSpan = ( ksize - 1 ) * 0.5f;
    float2 origin   = In.tex - texel * halfSpan;

    float3 sum = float3( 0.0f, 0.0f, 0.0f );

    // ksize >= 3 is guaranteed by the guard above, so the 3x3 core is unconditional.
    AddRing( sum, origin, texel, 0, 2 );                        //  3 x  3
    if ( ksize >= 5 )  { AddRing( sum, origin, texel, 3, 4 );  } //  5 x  5
    if ( ksize >= 7 )  { AddRing( sum, origin, texel, 5, 6 );  } //  7 x  7
    if ( ksize >= 9 )  { AddRing( sum, origin, texel, 7, 8 );  } //  9 x  9
    if ( ksize >= 11 ) { AddRing( sum, origin, texel, 9, 10 ); } // 11 x 11

    outColor = float4( sum / ( ksize * ksize ), 1.0f );
    return outColor;
}

//--------------------------- Technique ---------------------------
technique BurTechnique
{
    pass P0
    {
        LightEnable[0] = false;
        VertexShader = compile vs_3_0 MainVS_Screen();
        PixelShader  = compile ps_3_0 MainPS_Screen();
    }
}
由于3.0版本的shader似乎不允许pixel shader单独出现,因此我把点运算中只用像素着色器的实现方式改成了用特效(effect)来实现。HLSL语法中有if语句,也有for语句,但是这个程序却不厌其烦地把所有采样都逐条列了出来,而没有使用for循环。这是因为在实际使用中发现有一些限制,比如if语句的if(A>B),A与B中必须有一个是常量,就像上面见到的那种形式;for循环中间的判断也是如此,只是第二层j循环的上限可以是第一层循环的i,即不可以写成:
// Illustrative fragment only (the dots are a placeholder, so it is not compilable as written).
// It shows the nested-loop form the author reports could not be used when ksize / ksize1 are not constants.
for(int i=0;i<ksize;i++) { for(int j=0;j<ksize1;j++) { .......... } }
以上代码的ksize与ksize1都必须为常数,例外的情况是ksize1可以为第一层循环的 i 。这个问题不知道后续版本的shader有没有,反正我当前使用的版本有。
另外有一个需要注意的地方是指令数,2.0版本的shader支持的指令数相当少,3.0版本则要多很多,我最长写到了400多条、接近500条时才导致编译失败。还有一个需要提醒的是,3.0版本的shader只在D3D 9.0c及以后的版本中受支持。如果要求做更为复杂的图像处理,可以的话建议上D3D11,compute shader虽然我没用过,但从介绍来看,应该可以处理一些更为复杂的图像处理。
结合我的上一篇博客(DXVA2解码数据用texture纹理渲染 http://www.cnblogs.com/betterwgo/p/6327422.html),就算是实现了从硬解到简单图像处理的完整过程。显卡加速效果非常好,在我的Intel 5200上即便是4K视频也可以实现比正常播放略快的效果。
工程源码:http://download.csdn.net/download/qq_33892166/9755307